From 21737615e68fe68209f43398c372f8dae74e2dc4 Mon Sep 17 00:00:00 2001
From: Lukas Turcani <lukasturcani93@gmail.com>
Date: Wed, 28 Feb 2024 22:12:49 +0000
Subject: [PATCH] Support data frames (#62)

---
 docs/source/index.rst              |  47 ++++++++++
 pyproject.toml                     |   3 +
 src/atomlite/_internal/database.py |  67 ++++++++++++++
 tests/test_database.py             | 138 +++++++++++++++++++++++++++++
 4 files changed, 255 insertions(+)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0527b01..de367c7 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -162,6 +162,53 @@ And retrieve them:
   * :meth:`.Database.get_entries`: For additional documentation.
   * :meth:`.Entry.from_rdkit`: For additional documentation.
 
+Retrieving molecular properties as a DataFrame
+..............................................
+
+.. testsetup:: retrieving_properties
+
+  import atomlite
+  db = atomlite.Database(":memory:")
+  import rdkit.Chem as rdkit
+
+We can retrieve the properties of molecules as a Polars DataFrame:
+
+.. testcode:: retrieving_properties
+
+  db.add_entries(
+      [
+          atomlite.Entry.from_rdkit(
+              key="first",
+              molecule=rdkit.MolFromSmiles("C"),
+              properties={"num_atoms": 1, "is_interesting": False},
+          ),
+          atomlite.Entry.from_rdkit(
+              key="second",
+              molecule=rdkit.MolFromSmiles("CN"),
+              properties={"num_atoms": 2, "is_interesting": True},
+          ),
+      ]
+  )
+  print(db.get_property_df(["$.num_atoms", "$.is_interesting"]))
+
+.. testoutput:: retrieving_properties
+
+  shape: (2, 3)
+  ┌────────┬─────────────┬──────────────────┐
+  │ key    ┆ $.num_atoms ┆ $.is_interesting │
+  │ ---    ┆ ---         ┆ ---              │
+  │ str    ┆ i64         ┆ bool             │
+  ╞════════╪═════════════╪══════════════════╡
+  │ first  ┆ 1           ┆ false            │
+  │ second ┆ 2           ┆ true             │
+  └────────┴─────────────┴──────────────────┘
+
+.. seealso::
+
+  * :meth:`.Database.get_property_df`: For additional documentation.
+  * `Valid property paths`_: For a description of the syntax used to
+    retrieve properties.
+
 Updating molecular properties
 .............................
 
diff --git a/pyproject.toml b/pyproject.toml
index b6252b8..01a675f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ dev = [
   "ruff",
   "mypy",
   "numpy",
+  "polars",
   "pytest",
   "pytest-cov",
   "sphinx",
@@ -38,6 +39,8 @@ documentation = "https://atomlite.readthedocs.io"
 
 [tool.ruff]
 line-length = 79
+
+[tool.ruff.lint]
 select = ["ALL"]
 ignore = ["ANN101", "COM812", "ISC001"]
 
diff --git a/src/atomlite/_internal/database.py b/src/atomlite/_internal/database.py
index 424cdb5..8feddce 100644
--- a/src/atomlite/_internal/database.py
+++ b/src/atomlite/_internal/database.py
@@ -3,8 +3,10 @@
 import pathlib
 import sqlite3
 import typing
+from collections import defaultdict
 from dataclasses import dataclass, field
 
+import polars as pl
 import rdkit.Chem as rdkit  # noqa: N813
 
 from atomlite._internal.json import Json, Molecule, json_from_rdkit
@@ -298,6 +300,63 @@ def has_property_entry(self, key: str) -> bool:
             == 1
         )
 
+    def get_property_df(
+        self,
+        properties: collections.abc.Sequence[str],
+        *,
+        allow_missing: bool = False,
+    ) -> pl.DataFrame:
+        """Get a DataFrame of the properties in the database.
+
+        Parameters:
+            properties:
+                The paths of the properties to retrieve.
+                Valid paths are described
+                `here <https://www.sqlite.org/json1.html#path_arguments>`_.
+                You can also view various code
+                :ref:`examples<examples-valid-property-paths>`
+                in our docs.
+            allow_missing:
+                If ``True``, rows with some missing properties will be
+                included in the DataFrame and hold ``null`` values.
+
+        Returns:
+            A DataFrame with a ``key`` column and one column per
+            path, present even when no entry matches.
+        """
+        columns = []
+        params = []
+        wheres = []
+        for i, prop in enumerate(properties):
+            columns.append(
+                f"json_extract(properties,?) AS prop{i},"
+                f"json_type(properties,?) AS type{i}"
+            )
+            params.extend((prop, prop))
+            wheres.append(f"prop{i} IS NOT NULL")
+
+        select = ",".join(columns)
+        where = " OR ".join(wheres) if allow_missing else " AND ".join(wheres)
+
+        # Seed every column so that an empty result keeps its schema.
+        data: defaultdict[str, list[typing.Any]] = defaultdict(list)
+        data.update((name, []) for name in ("key", *properties))
+        for key, *results in self.connection.execute(
+            f"SELECT key,{select} "  # noqa: S608
+            f"FROM {self._molecule_table} "
+            f"WHERE {where}",
+            params,
+        ):
+            data["key"].append(key)
+            for name, value, json_type in _iter_props(properties, results):
+                if json_type in {"object", "array"}:  # returned as JSON text
+                    data[name].append(json.loads(value))
+                elif json_type in {"true", "false"}:  # returned as 1 / 0
+                    data[name].append(bool(value))
+                else:
+                    data[name].append(value)
+        return pl.DataFrame(data)
+
     def get_entry(self, key: str) -> Entry | None:
         """Get a molecular entry from the database.
 
@@ -874,3 +933,11 @@ def update_properties(
             )
         if commit:
             self.connection.commit()
+
+
+def _iter_props(
+    prop_names: collections.abc.Sequence[str],
+    props: collections.abc.Sequence[typing.Any],
+) -> collections.abc.Iterator[tuple[str, typing.Any, str]]:
+    pairs = zip(prop_names, range(0, len(props), 2), strict=True)
+    yield from ((name, props[j], props[j + 1]) for name, j in pairs)
diff --git a/tests/test_database.py b/tests/test_database.py
index ffed048..25d8b0c 100644
--- a/tests/test_database.py
+++ b/tests/test_database.py
@@ -4,6 +4,8 @@
 
 import atomlite
 import numpy as np
+import polars as pl
+import polars.testing as pl_testing
 import pytest
 import rdkit.Chem.AllChem as rdkit  # noqa: N813
 
@@ -501,6 +503,142 @@ def _assert_conformers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None:
         )
 
 
+def test_get_property_df_and() -> None:
+    """Check that entries lacking a requested property are dropped."""
+    entries = [
+        atomlite.Entry.from_rdkit(
+            key="first",
+            molecule=rdkit.MolFromSmiles("C"),
+            properties={"a": 1, "b": 10.0},
+        ),
+        atomlite.Entry.from_rdkit(
+            key="second",
+            molecule=rdkit.MolFromSmiles("CC"),
+            properties={"a": 2, "b": 20.0, "c": "hi second"},
+        ),
+        atomlite.Entry.from_rdkit(
+            key="third",
+            molecule=rdkit.MolFromSmiles("CCC"),
+            properties={"a": 3, "b": 30.0, "c": "hi third", "d": [1, 2, 3]},
+        ),
+        atomlite.Entry.from_rdkit(
+            key="fourth",
+            molecule=rdkit.MolFromSmiles("CCCC"),
+            properties={"a": 4, "b": 40.0, "e": {"a": 12, "b": 24}},
+        ),
+        atomlite.Entry.from_rdkit(
+            key="five",
+            molecule=rdkit.MolFromSmiles("CCCC"),
+            properties={"a": 4, "b": 40.0, "f": True},
+        ),
+        atomlite.Entry.from_rdkit(
+            key="six",
+            molecule=rdkit.MolFromSmiles("CCCC"),
+            properties={"a": 4, "b": 40.0, "f": False},
+        ),
+    ]
+    db = atomlite.Database(":memory:")
+    db.add_entries(entries)
+    # Every entry defines both "a" and "b".
+    actual = db.get_property_df(["$.a", "$.b"])
+    expected = pl.DataFrame(
+        {
+            "key": ["first", "second", "third", "fourth", "five", "six"],
+            "$.a": [1, 2, 3, 4, 4, 4],
+            "$.b": [10.0, 20.0, 30.0, 40.0, 40.0, 40.0],
+        }
+    )
+    pl_testing.assert_frame_equal(actual, expected)
+    # Only "second" and "third" also define "c".
+    actual = db.get_property_df(["$.a", "$.b", "$.c"])
+    expected = pl.DataFrame(
+        {
+            "key": ["second", "third"],
+            "$.a": [2, 3],
+            "$.b": [20.0, 30.0],
+            "$.c": ["hi second", "hi third"],
+        }
+    )
+    pl_testing.assert_frame_equal(actual, expected)
+    # Only "fourth" defines the nested object "e".
+    actual = db.get_property_df(["$.a", "$.b", "$.e"])
+    expected = pl.DataFrame(
+        {
+            "key": ["fourth"],
+            "$.a": [4],
+            "$.b": [40.0],
+            "$.e": [{"a": 12, "b": 24}],
+        }
+    )
+    pl_testing.assert_frame_equal(actual, expected)
+    # Only "third" defines the array "d".
+    actual = db.get_property_df(["$.a", "$.b", "$.d"])
+    expected = pl.DataFrame(
+        {
+            "key": ["third"],
+            "$.a": [3],
+            "$.b": [30.0],
+            "$.d": [[1, 2, 3]],
+        }
+    )
+    pl_testing.assert_frame_equal(actual, expected)
+    # Only "five" and "six" define the boolean "f".
+    actual = db.get_property_df(["$.a", "$.b", "$.f"])
+    expected = pl.DataFrame(
+        {
+            "key": ["five", "six"],
+            "$.a": [4, 4],
+            "$.b": [40.0, 40.0],
+            "$.f": [True, False],
+        }
+    )
+    pl_testing.assert_frame_equal(actual, expected)
+
+
+def test_get_property_df_or() -> None:
+    """Check that ``allow_missing`` keeps entries, filling gaps with null."""
+    entries = [
+        atomlite.Entry.from_rdkit(
+            key="first",
+            molecule=rdkit.MolFromSmiles("C"),
+            properties={"a": 1, "b": 10.0},
+        ),
+        atomlite.Entry.from_rdkit(
+            key="second",
+            molecule=rdkit.MolFromSmiles("CC"),
+            properties={"a": 2, "b": 20.0, "c": "hi second"},
+        ),
+        atomlite.Entry.from_rdkit(
+            key="third",
+            molecule=rdkit.MolFromSmiles("CCC"),
+            properties={"a": 3, "b": 30.0, "c": "hi third"},
+        ),
+    ]
+    db = atomlite.Database(":memory:")
+    db.add_entries(entries)
+    # No entry is missing "a" or "b", so nothing is null.
+    actual = db.get_property_df(["$.a", "$.b"], allow_missing=True)
+    expected = pl.DataFrame(
+        {
+            "key": ["first", "second", "third"],
+            "$.a": [1, 2, 3],
+            "$.b": [10.0, 20.0, 30.0],
+        }
+    )
+    pl_testing.assert_frame_equal(actual, expected)
+    # "first" lacks "c" and gets a null there.
+    actual = db.get_property_df(["$.a", "$.b", "$.c"], allow_missing=True)
+    expected = pl.DataFrame(
+        {
+            "key": ["first", "second", "third"],
+            "$.a": [1, 2, 3],
+            "$.b": [10.0, 20.0, 30.0],
+            "$.c": [None, "hi second", "hi third"],
+        }
+    )
+    pl_testing.assert_frame_equal(actual, expected)
+
+
 def _assert_atom_numbers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None:
     expected.UpdatePropertyCache()
     actual.UpdatePropertyCache()