From 21737615e68fe68209f43398c372f8dae74e2dc4 Mon Sep 17 00:00:00 2001 From: Lukas Turcani Date: Wed, 28 Feb 2024 22:12:49 +0000 Subject: [PATCH] Support data frames (#62) --- docs/source/index.rst | 47 ++++++++++ pyproject.toml | 3 + src/atomlite/_internal/database.py | 67 ++++++++++++++ tests/test_database.py | 138 +++++++++++++++++++++++++++++ 4 files changed, 255 insertions(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 0527b01..de367c7 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -162,6 +162,53 @@ And retrieve them: * :meth:`.Database.get_entries`: For additional documentation. * :meth:`.Entry.from_rdkit`: For additional documentation. +Retrieving molecular properties as a DataFrame +.............................................. + +.. testsetup:: retrieving_properties + + import atomlite + db = atomlite.Database(":memory:") + import rdkit.Chem as rdkit + +We can retrieve the properties of molecules as a DataFrame: + +.. testcode:: retrieving_properties + + db.add_entries( + [ + atomlite.Entry.from_rdkit( + key="first", + molecule=rdkit.MolFromSmiles("C"), + properties={"num_atoms": 1, "is_interesting": False}, + ), + atomlite.Entry.from_rdkit( + key="second", + molecule=rdkit.MolFromSmiles("CN"), + properties={"num_atoms": 2, "is_interesting": True}, + ), + ] + ) + print(db.get_property_df(["$.num_atoms", "$.is_interesting"])) + +.. testoutput:: retrieving_properties + + shape: (2, 3) + ┌────────┬─────────────┬──────────────────┐ + │ key ┆ $.num_atoms ┆ $.is_interesting │ + │ --- ┆ --- ┆ --- │ + │ str ┆ i64 ┆ bool │ + ╞════════╪═════════════╪══════════════════╡ + │ first ┆ 1 ┆ false │ + │ second ┆ 2 ┆ true │ + └────────┴─────────────┴──────────────────┘ + +.. seealso:: + + * :meth:`.Database.get_property_df`: For additional documentation. + * `Valid property paths`_: For a description of the syntax used to + retrieve properties. + Updating molecular properties ............................. diff --git a/pyproject.toml b/pyproject.toml index b6252b8..01a675f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dev = [ "ruff", "mypy", "numpy", + "polars", "pytest", "pytest-cov", "sphinx", @@ -38,6 +39,8 @@ documentation = "https://atomlite.readthedocs.io" [tool.ruff] line-length = 79 + +[tool.ruff.lint] select = ["ALL"] ignore = ["ANN101", "COM812", "ISC001"] diff --git a/src/atomlite/_internal/database.py b/src/atomlite/_internal/database.py index 424cdb5..8feddce 100644 --- a/src/atomlite/_internal/database.py +++ b/src/atomlite/_internal/database.py @@ -3,8 +3,10 @@ import pathlib import sqlite3 import typing +from collections import defaultdict from dataclasses import dataclass, field +import polars as pl import rdkit.Chem as rdkit # noqa: N813 from atomlite._internal.json import Json, Molecule, json_from_rdkit @@ -298,6 +300,63 @@ def has_property_entry(self, key: str) -> bool: == 1 ) + def get_property_df( + self, + properties: collections.abc.Sequence[str], + *, + allow_missing: bool = False, + ) -> pl.DataFrame: + """Get a DataFrame of the properties in the database. + + Parameters: + properties: + The paths of the properties to retrieve. + Valid paths are described + `here `_. + You can also view various code + :ref:`examples` + in our docs. + allow_missing: + If ``True``, rows with some missing properties will be + included in the DataFrame and hold ``null`` values. + + Returns: + A DataFrame of the property entries in the database. + """ + columns = [] + params = [] + wheres = [] + for i, prop in enumerate(properties): + columns.append( + f"json_extract(properties,?) AS prop{i}," + f"json_type(properties,?) AS type{i}" + ) + params.append(prop) + params.append(prop) + wheres.append(f"prop{i} IS NOT NULL") + + select = ",".join(columns) + where = " OR ".join(wheres) if allow_missing else " AND ".join(wheres) + + data = defaultdict(list) + for key, *property_results in self.connection.execute( + f"SELECT key,{select} " # noqa: S608 + f"FROM {self._molecule_table} " + f"WHERE {where}", + params, + ): + data["key"].append(key) + for prop_name, prop_value, prop_type in _iter_props( + properties, property_results + ): + if prop_type in {"object", "array"}: + data[prop_name].append(json.loads(prop_value)) + elif prop_type in {"true", "false"}: + data[prop_name].append(bool(prop_value)) + else: + data[prop_name].append(prop_value) + return pl.DataFrame(data) + def get_entry(self, key: str) -> Entry | None: """Get a molecular entry from the database. @@ -874,3 +933,11 @@ def update_properties( ) if commit: self.connection.commit() + + +def _iter_props( + prop_names: collections.abc.Sequence[str], + props: collections.abc.Sequence[typing.Any], +) -> collections.abc.Iterator[tuple[str, typing.Any, str]]: + for name, i in zip(prop_names, range(0, len(props), 2), strict=True): + yield name, props[i], props[i + 1] diff --git a/tests/test_database.py b/tests/test_database.py index ffed048..25d8b0c 100644 --- a/tests/test_database.py +++ b/tests/test_database.py @@ -4,6 +4,8 @@ import atomlite import numpy as np +import polars as pl +import polars.testing as pl_testing import pytest import rdkit.Chem.AllChem as rdkit # noqa: N813 @@ -501,6 +503,142 @@ def _assert_conformers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None: ) +def test_get_property_df_and() -> None: + db = atomlite.Database(":memory:") + db.add_entries( + [ + atomlite.Entry.from_rdkit( + "first", + rdkit.MolFromSmiles("C"), + {"a": 1, "b": 10.0}, + ), + atomlite.Entry.from_rdkit( + "second", + rdkit.MolFromSmiles("CC"), + {"a": 2, "b": 20.0, "c": "hi second"}, + ), + atomlite.Entry.from_rdkit( + "third", + rdkit.MolFromSmiles("CCC"), + {"a": 3, "b": 30.0, "c": "hi third", "d": [1, 2, 3]}, + ), + atomlite.Entry.from_rdkit( + "fourth", + rdkit.MolFromSmiles("CCCC"), + {"a": 4, "b": 40.0, "e": {"a": 12, "b": 24}}, + ), + atomlite.Entry.from_rdkit( + "five", + rdkit.MolFromSmiles("CCCC"), + {"a": 4, "b": 40.0, "f": True}, + ), + atomlite.Entry.from_rdkit( + "six", + rdkit.MolFromSmiles("CCCC"), + {"a": 4, "b": 40.0, "f": False}, + ), + ] + ) + pl_testing.assert_frame_equal( + db.get_property_df(["$.a", "$.b"]), + pl.DataFrame( + { + "key": ["first", "second", "third", "fourth", "five", "six"], + "$.a": [1, 2, 3, 4, 4, 4], + "$.b": [10.0, 20.0, 30.0, 40.0, 40.0, 40.0], + } + ), + ) + pl_testing.assert_frame_equal( + db.get_property_df(["$.a", "$.b", "$.c"]), + pl.DataFrame( + { + "key": ["second", "third"], + "$.a": [2, 3], + "$.b": [20.0, 30.0], + "$.c": ["hi second", "hi third"], + } + ), + ) + pl_testing.assert_frame_equal( + db.get_property_df(["$.a", "$.b", "$.e"]), + pl.DataFrame( + { + "key": ["fourth"], + "$.a": [4], + "$.b": [40.0], + "$.e": [{"a": 12, "b": 24}], + } + ), + ) + pl_testing.assert_frame_equal( + db.get_property_df(["$.a", "$.b", "$.d"]), + pl.DataFrame( + { + "key": ["third"], + "$.a": [3], + "$.b": [30.0], + "$.d": [[1, 2, 3]], + } + ), + ) + pl_testing.assert_frame_equal( + db.get_property_df(["$.a", "$.b", "$.f"]), + pl.DataFrame( + { + "key": ["five", "six"], + "$.a": [4, 4], + "$.b": [40.0, 40.0], + "$.f": [True, False], + } + ), + ) + + +def test_get_property_df_or() -> None: + db = atomlite.Database(":memory:") + db.add_entries( + [ + atomlite.Entry.from_rdkit( + "first", + rdkit.MolFromSmiles("C"), + {"a": 1, "b": 10.0}, + ), + atomlite.Entry.from_rdkit( + "second", + rdkit.MolFromSmiles("CC"), + {"a": 2, "b": 20.0, "c": "hi second"}, + ), + atomlite.Entry.from_rdkit( + "third", + rdkit.MolFromSmiles("CCC"), + {"a": 3, "b": 30.0, "c": "hi third"}, + ), + ] + ) + pl_testing.assert_frame_equal( + db.get_property_df(["$.a", "$.b"], allow_missing=True), + pl.DataFrame( + { + "key": ["first", "second", "third"], + "$.a": [1, 2, 3], + "$.b": [10.0, 20.0, 30.0], + } + ), + ) + pl_testing.assert_frame_equal( + db.get_property_df(["$.a", "$.b", "$.c"], allow_missing=True), + pl.DataFrame( + { + "key": ["first", "second", "third"], + "$.a": [1, 2, 3], + "$.b": [10.0, 20.0, 30.0], + "$.c": [None, "hi second", "hi third"], + } + ), + ) + + def _assert_atom_numbers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None: expected.UpdatePropertyCache() actual.UpdatePropertyCache()