Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support data frames #62

Merged
merged 4 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,53 @@ And retrieve them:
* :meth:`.Database.get_entries`: For additional documentation.
* :meth:`.Entry.from_rdkit`: For additional documentation.

Retrieving molecular properties as a DataFrame
..............................................

.. testsetup:: retrieving_properties

import atomlite
db = atomlite.Database(":memory:")
import rdkit.Chem as rdkit

We can retrieve the properties of molecules as a polars DataFrame:

.. testcode:: retrieving_properties

db.add_entries(
[
atomlite.Entry.from_rdkit(
key="first",
molecule=rdkit.MolFromSmiles("C"),
properties={"num_atoms": 1, "is_interesting": False},
),
atomlite.Entry.from_rdkit(
key="second",
molecule=rdkit.MolFromSmiles("CN"),
properties={"num_atoms": 2, "is_interesting": True},
),
]
)
print(db.get_property_df(["$.num_atoms", "$.is_interesting"]))

.. testoutput:: retrieving_properties

shape: (2, 3)
┌────────┬─────────────┬──────────────────┐
│ key ┆ $.num_atoms ┆ $.is_interesting │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ bool │
╞════════╪═════════════╪══════════════════╡
│ first ┆ 1 ┆ false │
│ second ┆ 2 ┆ true │
└────────┴─────────────┴──────────────────┘

.. seealso::

* :meth:`.Database.get_property_df`: For additional documentation.
* `Valid property paths`_: For a description of the syntax used to
retrieve properties.

Updating molecular properties
.............................

Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ dev = [
"ruff",
"mypy",
"numpy",
"polars",
"pytest",
"pytest-cov",
"sphinx",
Expand All @@ -38,6 +39,8 @@ documentation = "https://atomlite.readthedocs.io"

[tool.ruff]
line-length = 79

[tool.ruff.lint]
select = ["ALL"]
ignore = ["ANN101", "COM812", "ISC001"]

Expand Down
67 changes: 67 additions & 0 deletions src/atomlite/_internal/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import pathlib
import sqlite3
import typing
from collections import defaultdict
from dataclasses import dataclass, field

import polars as pl
import rdkit.Chem as rdkit # noqa: N813

from atomlite._internal.json import Json, Molecule, json_from_rdkit
Expand Down Expand Up @@ -298,6 +300,63 @@ def has_property_entry(self, key: str) -> bool:
== 1
)

def get_property_df(
    self,
    properties: collections.abc.Sequence[str],
    *,
    allow_missing: bool = False,
) -> pl.DataFrame:
    """Get a DataFrame of the properties in the database.

    Parameters:
        properties:
            The paths of the properties to retrieve.
            Valid paths are described
            `here <https://www.sqlite.org/json1.html#path_arguments>`_.
            You can also view various code
            :ref:`examples<examples-valid-property-paths>`
            in our docs. If no paths are given, no property
            filter is applied and only the ``key`` column is
            returned.
        allow_missing:
            If ``True``, rows with some missing properties will be
            included in the DataFrame and hold ``null`` values.
            Rows for which every requested property is missing
            are still left out.

    Returns:
        A DataFrame of the property entries in the database. It
        holds a ``key`` column followed by one column per
        requested path, named after the path. If no row matches,
        the DataFrame is built from no data and has no columns.
    """
    columns = ["key"]
    params = []
    wheres = []
    for i, prop in enumerate(properties):
        # Each path is bound twice: once for the value and once for
        # its JSON type, which drives the decoding below.
        columns.append(
            f"json_extract(properties,?) AS prop{i},"
            f"json_type(properties,?) AS type{i}"
        )
        params.extend((prop, prop))
        wheres.append(f"prop{i} IS NOT NULL")

    query = (
        f"SELECT {','.join(columns)} "  # noqa: S608
        f"FROM {self._molecule_table}"
    )
    # Only add a WHERE clause when there is something to filter on;
    # an empty clause would be malformed SQL.
    if wheres:
        joiner = " OR " if allow_missing else " AND "
        query += f" WHERE {joiner.join(wheres)}"

    data = defaultdict(list)
    for key, *property_results in self.connection.execute(query, params):
        data["key"].append(key)
        for prop_name, prop_value, prop_type in _iter_props(
            properties, property_results
        ):
            if prop_type in {"object", "array"}:
                # Containers are returned as JSON text.
                value = json.loads(prop_value)
            elif prop_type in {"true", "false"}:
                # Booleans are returned as integers.
                value = bool(prop_value)
            else:
                value = prop_value
            data[prop_name].append(value)
    return pl.DataFrame(data)

def get_entry(self, key: str) -> Entry | None:
"""Get a molecular entry from the database.

Expand Down Expand Up @@ -874,3 +933,11 @@ def update_properties(
)
if commit:
self.connection.commit()


def _iter_props(
prop_names: collections.abc.Sequence[str],
props: collections.abc.Sequence[typing.Any],
) -> collections.abc.Iterator[tuple[str, typing.Any, str]]:
for name, i in zip(prop_names, range(0, len(props), 2), strict=True):
yield name, props[i], props[i + 1]
138 changes: 138 additions & 0 deletions tests/test_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import atomlite
import numpy as np
import polars as pl
import polars.testing as pl_testing
import pytest
import rdkit.Chem.AllChem as rdkit # noqa: N813

Expand Down Expand Up @@ -501,6 +503,142 @@ def _assert_conformers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None:
)


def test_get_property_df_and() -> None:
    """Check the default mode of ``get_property_df``.

    Every expected frame lists only the entries which hold all of
    the requested properties.
    """
    entry_specs = (
        ("first", "C", {"a": 1, "b": 10.0}),
        ("second", "CC", {"a": 2, "b": 20.0, "c": "hi second"}),
        (
            "third",
            "CCC",
            {"a": 3, "b": 30.0, "c": "hi third", "d": [1, 2, 3]},
        ),
        ("fourth", "CCCC", {"a": 4, "b": 40.0, "e": {"a": 12, "b": 24}}),
        ("five", "CCCC", {"a": 4, "b": 40.0, "f": True}),
        ("six", "CCCC", {"a": 4, "b": 40.0, "f": False}),
    )
    db = atomlite.Database(":memory:")
    db.add_entries(
        [
            atomlite.Entry.from_rdkit(key, rdkit.MolFromSmiles(smiles), props)
            for key, smiles, props in entry_specs
        ]
    )

    # Each case pairs the requested paths with the expected columns.
    cases = (
        (
            ["$.a", "$.b"],
            {
                "key": ["first", "second", "third", "fourth", "five", "six"],
                "$.a": [1, 2, 3, 4, 4, 4],
                "$.b": [10.0, 20.0, 30.0, 40.0, 40.0, 40.0],
            },
        ),
        (
            ["$.a", "$.b", "$.c"],
            {
                "key": ["second", "third"],
                "$.a": [2, 3],
                "$.b": [20.0, 30.0],
                "$.c": ["hi second", "hi third"],
            },
        ),
        (
            ["$.a", "$.b", "$.e"],
            {
                "key": ["fourth"],
                "$.a": [4],
                "$.b": [40.0],
                "$.e": [{"a": 12, "b": 24}],
            },
        ),
        (
            ["$.a", "$.b", "$.d"],
            {
                "key": ["third"],
                "$.a": [3],
                "$.b": [30.0],
                "$.d": [[1, 2, 3]],
            },
        ),
        (
            ["$.a", "$.b", "$.f"],
            {
                "key": ["five", "six"],
                "$.a": [4, 4],
                "$.b": [40.0, 40.0],
                "$.f": [True, False],
            },
        ),
    )
    for paths, expected in cases:
        pl_testing.assert_frame_equal(
            db.get_property_df(paths),
            pl.DataFrame(expected),
        )


def test_get_property_df_or() -> None:
    """Check ``get_property_df`` with ``allow_missing=True``.

    Entries lacking one of the requested properties are expected to
    stay in the frame and hold ``None`` for it.
    """
    entry_specs = (
        ("first", "C", {"a": 1, "b": 10.0}),
        ("second", "CC", {"a": 2, "b": 20.0, "c": "hi second"}),
        ("third", "CCC", {"a": 3, "b": 30.0, "c": "hi third"}),
    )
    db = atomlite.Database(":memory:")
    db.add_entries(
        [
            atomlite.Entry.from_rdkit(key, rdkit.MolFromSmiles(smiles), props)
            for key, smiles, props in entry_specs
        ]
    )

    # Each case pairs the requested paths with the expected columns.
    cases = (
        (
            ["$.a", "$.b"],
            {
                "key": ["first", "second", "third"],
                "$.a": [1, 2, 3],
                "$.b": [10.0, 20.0, 30.0],
            },
        ),
        (
            ["$.a", "$.b", "$.c"],
            {
                "key": ["first", "second", "third"],
                "$.a": [1, 2, 3],
                "$.b": [10.0, 20.0, 30.0],
                "$.c": [None, "hi second", "hi third"],
            },
        ),
    )
    for paths, expected in cases:
        pl_testing.assert_frame_equal(
            db.get_property_df(paths, allow_missing=True),
            pl.DataFrame(expected),
        )


def _assert_atom_numbers_match(expected: rdkit.Mol, actual: rdkit.Mol) -> None:
expected.UpdatePropertyCache()
actual.UpdatePropertyCache()
Expand Down
Loading