Skip to content

Commit

Permalink
Support data frames (#62)
Browse files Browse the repository at this point in the history
  • Loading branch information
lukasturcani authored Feb 28, 2024
1 parent 1167276 commit 2173761
Show file tree
Hide file tree
Showing 4 changed files with 255 additions and 0 deletions.
47 changes: 47 additions & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,53 @@ And retrieve them:
* :meth:`.Database.get_entries`: For additional documentation.
* :meth:`.Entry.from_rdkit`: For additional documentation.

Retrieving molecular properties as a DataFrame
..............................................

.. testsetup:: retrieving_properties

    import atomlite
    db = atomlite.Database(":memory:")
    import rdkit.Chem as rdkit

We can retrieve the properties of molecules as a DataFrame:

.. testcode:: retrieving_properties

    db.add_entries(
        [
            atomlite.Entry.from_rdkit(
                key="first",
                molecule=rdkit.MolFromSmiles("C"),
                properties={"num_atoms": 1, "is_interesting": False},
            ),
            atomlite.Entry.from_rdkit(
                key="second",
                molecule=rdkit.MolFromSmiles("CN"),
                properties={"num_atoms": 2, "is_interesting": True},
            ),
        ]
    )
    print(db.get_property_df(["$.num_atoms", "$.is_interesting"]))

.. testoutput:: retrieving_properties

    shape: (2, 3)
    ┌────────┬─────────────┬──────────────────┐
    │ key    ┆ $.num_atoms ┆ $.is_interesting │
    │ ---    ┆ ---         ┆ ---              │
    │ str    ┆ i64         ┆ bool             │
    ╞════════╪═════════════╪══════════════════╡
    │ first  ┆ 1           ┆ false            │
    │ second ┆ 2           ┆ true             │
    └────────┴─────────────┴──────────────────┘

.. seealso::

    * :meth:`.Database.get_property_df`: For additional documentation.
    * `Valid property paths`_: For a description of the syntax used to
      retrieve properties.

Updating molecular properties
.............................

Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dev = [
"ruff",
"mypy",
"numpy",
"polars",
"pytest",
"pytest-cov",
"sphinx",
Expand All @@ -38,6 +39,8 @@ documentation = "https://atomlite.readthedocs.io"

[tool.ruff]
line-length = 79

[tool.ruff.lint]
select = ["ALL"]
ignore = ["ANN101", "COM812", "ISC001"]

Expand Down
67 changes: 67 additions & 0 deletions src/atomlite/_internal/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import pathlib
import sqlite3
import typing
from collections import defaultdict
from dataclasses import dataclass, field

import polars as pl
import rdkit.Chem as rdkit # noqa: N813

from atomlite._internal.json import Json, Molecule, json_from_rdkit
Expand Down Expand Up @@ -298,6 +300,63 @@ def has_property_entry(self, key: str) -> bool:
== 1
)

def get_property_df(
    self,
    properties: collections.abc.Sequence[str],
    *,
    allow_missing: bool = False,
) -> pl.DataFrame:
    """Get a DataFrame of the properties in the database.

    Parameters:
        properties:
            The paths of the properties to retrieve.
            Valid paths are described
            `here <https://www.sqlite.org/json1.html#path_arguments>`_.
            You can also view various code
            :ref:`examples<examples-valid-property-paths>`
            in our docs.
        allow_missing:
            If ``True``, rows with some missing properties will be
            included in the DataFrame and hold ``null`` values.

    Returns:
        A DataFrame of the property entries in the database. It holds
        a ``key`` column followed by one column per requested path,
        named after that path. These columns are present even when
        no row of the database matches.
    """
    columns = []
    params = []
    wheres = []
    for i, prop in enumerate(properties):
        # Each path is bound twice: once to extract the value and once
        # to get its JSON type, which drives the decoding further down.
        columns.append(
            f"json_extract(properties,?) AS prop{i},"
            f"json_type(properties,?) AS type{i}"
        )
        params.extend((prop, prop))
        wheres.append(f"prop{i} IS NOT NULL")

    select = ",".join(columns)
    # OR keeps rows holding at least one property, AND requires them all.
    where = (" OR " if allow_missing else " AND ").join(wheres)

    # Create every column up front so that a query matching no rows
    # still yields the expected columns instead of a column-less frame.
    data: dict[str, list[typing.Any]] = {"key": []}
    for prop in properties:
        data.setdefault(prop, [])

    for key, *property_results in self.connection.execute(
        f"SELECT key,{select} "  # noqa: S608
        f"FROM {self._molecule_table} "
        f"WHERE {where}",
        params,
    ):
        data["key"].append(key)
        for prop_name, prop_value, prop_type in _iter_props(
            properties, property_results
        ):
            if prop_type in {"object", "array"}:
                # Objects and arrays are decoded from their JSON text.
                data[prop_name].append(json.loads(prop_value))
            elif prop_type in {"true", "false"}:
                # Convert the extracted value into a Python bool.
                data[prop_name].append(bool(prop_value))
            else:
                data[prop_name].append(prop_value)
    return pl.DataFrame(data)

def get_entry(self, key: str) -> Entry | None:
"""Get a molecular entry from the database.
Expand Down Expand Up @@ -874,3 +933,11 @@ def update_properties(
)
if commit:
self.connection.commit()


def _iter_props(
prop_names: collections.abc.Sequence[str],
props: collections.abc.Sequence[typing.Any],
) -> collections.abc.Iterator[tuple[str, typing.Any, str]]:
for name, i in zip(prop_names, range(0, len(props), 2), strict=True):
yield name, props[i], props[i + 1]
138 changes: 138 additions & 0 deletions tests/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import atomlite
import numpy as np
import polars as pl
import polars.testing as pl_testing
import pytest
import rdkit.Chem.AllChem as rdkit # noqa: N813

Expand Down Expand Up @@ -501,6 +503,142 @@ def _assert_conformers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None:
)


def test_get_property_df_and() -> None:
    """Check that only entries with every requested property are returned."""
    database = atomlite.Database(":memory:")
    # Each spec is (key, SMILES, properties).
    entry_specs = [
        ("first", "C", {"a": 1, "b": 10.0}),
        ("second", "CC", {"a": 2, "b": 20.0, "c": "hi second"}),
        (
            "third",
            "CCC",
            {"a": 3, "b": 30.0, "c": "hi third", "d": [1, 2, 3]},
        ),
        ("fourth", "CCCC", {"a": 4, "b": 40.0, "e": {"a": 12, "b": 24}}),
        ("five", "CCCC", {"a": 4, "b": 40.0, "f": True}),
        ("six", "CCCC", {"a": 4, "b": 40.0, "f": False}),
    ]
    database.add_entries(
        [
            atomlite.Entry.from_rdkit(
                key,
                rdkit.MolFromSmiles(smiles),
                props,
            )
            for key, smiles, props in entry_specs
        ]
    )

    # Each case is (requested paths, expected frame contents).
    cases = [
        (
            ["$.a", "$.b"],
            {
                "key": ["first", "second", "third", "fourth", "five", "six"],
                "$.a": [1, 2, 3, 4, 4, 4],
                "$.b": [10.0, 20.0, 30.0, 40.0, 40.0, 40.0],
            },
        ),
        (
            ["$.a", "$.b", "$.c"],
            {
                "key": ["second", "third"],
                "$.a": [2, 3],
                "$.b": [20.0, 30.0],
                "$.c": ["hi second", "hi third"],
            },
        ),
        (
            ["$.a", "$.b", "$.e"],
            {
                "key": ["fourth"],
                "$.a": [4],
                "$.b": [40.0],
                "$.e": [{"a": 12, "b": 24}],
            },
        ),
        (
            ["$.a", "$.b", "$.d"],
            {
                "key": ["third"],
                "$.a": [3],
                "$.b": [30.0],
                "$.d": [[1, 2, 3]],
            },
        ),
        (
            ["$.a", "$.b", "$.f"],
            {
                "key": ["five", "six"],
                "$.a": [4, 4],
                "$.b": [40.0, 40.0],
                "$.f": [True, False],
            },
        ),
    ]
    for paths, expected in cases:
        pl_testing.assert_frame_equal(
            database.get_property_df(paths),
            pl.DataFrame(expected),
        )


def test_get_property_df_or() -> None:
    """Check that ``allow_missing=True`` keeps entries lacking a property."""
    database = atomlite.Database(":memory:")
    # Each spec is (key, SMILES, properties).
    entry_specs = [
        ("first", "C", {"a": 1, "b": 10.0}),
        ("second", "CC", {"a": 2, "b": 20.0, "c": "hi second"}),
        ("third", "CCC", {"a": 3, "b": 30.0, "c": "hi third"}),
    ]
    database.add_entries(
        [
            atomlite.Entry.from_rdkit(
                key,
                rdkit.MolFromSmiles(smiles),
                props,
            )
            for key, smiles, props in entry_specs
        ]
    )

    # Each case is (requested paths, expected frame contents).
    cases = [
        (
            ["$.a", "$.b"],
            {
                "key": ["first", "second", "third"],
                "$.a": [1, 2, 3],
                "$.b": [10.0, 20.0, 30.0],
            },
        ),
        (
            ["$.a", "$.b", "$.c"],
            {
                "key": ["first", "second", "third"],
                "$.a": [1, 2, 3],
                "$.b": [10.0, 20.0, 30.0],
                "$.c": [None, "hi second", "hi third"],
            },
        ),
    ]
    for paths, expected in cases:
        pl_testing.assert_frame_equal(
            database.get_property_df(paths, allow_missing=True),
            pl.DataFrame(expected),
        )


def _assert_atom_numbers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None:
expected.UpdatePropertyCache()
actual.UpdatePropertyCache()
Expand Down

0 comments on commit 2173761

Please sign in to comment.