From a05df4d36d589088d4050d42022cc9c6ca018616 Mon Sep 17 00:00:00 2001 From: Tim Kucera Date: Tue, 28 Nov 2023 20:08:04 +0100 Subject: [PATCH] intermediate commit --- .../adapters/__baseclass__.py} | 0 proteinshake/backend/collection.py | 25 +++++++++ proteinshake/backend/database.py | 13 +++++ proteinshake/backend/protein.py | 4 ++ proteinshake/backend/structure.py | 4 ++ proteinshake/{tasks => frontend}/README.md | 4 ++ proteinshake/frontend/dataset.py | 54 +++++++++++++++++++ proteinshake/frontend/datasets/dataset.py | 0 .../frontend/evaluators/classification.py | 4 ++ .../evaluators}/evaluator.py | 0 proteinshake/frontend/protein.py | 4 ++ proteinshake/frontend/splitters/attribute.py | 14 +++++ .../frontend/splitters/from_existing.py | 0 .../frontend/splitters/pairwise_attribute.py | 17 ++++++ .../frontend/splitters/sequence_similarity.py | 0 .../{tasks => frontend/splitters}/splitter.py | 6 ++- .../splitters/structure_similarity.py | 0 .../targets}/pairwise_property_target.py | 0 .../targets}/property_target.py | 0 .../{tasks => frontend/targets}/target.py | 0 proteinshake/frontend/task.py | 30 +++++++++++ .../frontend/tasks/pairwise_protein.py | 0 .../frontend/tasks/pairwise_residue.py | 0 proteinshake/frontend/tasks/protein.py | 0 proteinshake/frontend/tasks/residue.py | 0 proteinshake/frontend/tasks/task.py | 0 .../transforms/framework/torch.py | 0 .../transforms/post_framework/note.md | 0 .../post_representation/add_node_degree.py | 0 .../pre_framework/random_residue_masking.py | 0 .../pre_representation/residue_level.py | 0 .../transforms/representation/point.py | 0 proteinshake/tasks/attribute_splitter.py | 16 ------ .../tasks/classification_evaluator.py | 4 -- .../tasks/pairwise_attribute_splitter.py | 21 -------- proteinshake/tasks/task.py | 29 ---------- proteinshake/tasks/time_splitter.py | 15 ------ 37 files changed, 177 insertions(+), 87 deletions(-) rename proteinshake/{frontend/dataloader.py => backend/adapters/__baseclass__.py} (100%) create mode 100644 proteinshake/backend/structure.py rename proteinshake/{tasks => frontend}/README.md (98%) create mode 100644 proteinshake/frontend/dataset.py delete mode 100644 proteinshake/frontend/datasets/dataset.py rename proteinshake/{tasks => frontend/evaluators}/evaluator.py (100%) create mode 100644 proteinshake/frontend/protein.py create mode 100644 proteinshake/frontend/splitters/attribute.py delete mode 100644 proteinshake/frontend/splitters/from_existing.py create mode 100644 proteinshake/frontend/splitters/pairwise_attribute.py delete mode 100644 proteinshake/frontend/splitters/sequence_similarity.py rename proteinshake/{tasks => frontend/splitters}/splitter.py (59%) delete mode 100644 proteinshake/frontend/splitters/structure_similarity.py rename proteinshake/{tasks => frontend/targets}/pairwise_property_target.py (100%) rename proteinshake/{tasks => frontend/targets}/property_target.py (100%) rename proteinshake/{tasks => frontend/targets}/target.py (100%) create mode 100644 proteinshake/frontend/task.py delete mode 100644 proteinshake/frontend/tasks/pairwise_protein.py delete mode 100644 proteinshake/frontend/tasks/pairwise_residue.py delete mode 100644 proteinshake/frontend/tasks/protein.py delete mode 100644 proteinshake/frontend/tasks/residue.py delete mode 100644 proteinshake/frontend/tasks/task.py rename proteinshake/{ => frontend}/transforms/framework/torch.py (100%) rename proteinshake/{ => frontend}/transforms/post_framework/note.md (100%) rename proteinshake/{ => frontend}/transforms/post_representation/add_node_degree.py (100%) rename proteinshake/{ => frontend}/transforms/pre_framework/random_residue_masking.py (100%) rename proteinshake/{ => frontend}/transforms/pre_representation/residue_level.py (100%) rename proteinshake/{ => frontend}/transforms/representation/point.py (100%) delete mode 100644 proteinshake/tasks/attribute_splitter.py delete mode 100644 proteinshake/tasks/classification_evaluator.py delete mode 100644 proteinshake/tasks/pairwise_attribute_splitter.py delete mode 100644 proteinshake/tasks/task.py delete mode 100644 proteinshake/tasks/time_splitter.py diff --git a/proteinshake/frontend/dataloader.py b/proteinshake/backend/adapters/__baseclass__.py similarity index 100% rename from proteinshake/frontend/dataloader.py rename to proteinshake/backend/adapters/__baseclass__.py diff --git a/proteinshake/backend/collection.py b/proteinshake/backend/collection.py index e69de29b..42d4d2e3 100644 --- a/proteinshake/backend/collection.py +++ b/proteinshake/backend/collection.py @@ -0,0 +1,25 @@ +class Collection: + """ + Holds a set of proteins as the result of a database query and prepares it for dataset creation. + """ + + def __init__(self, proteins: list[dict]) -> None: + pass + + def add(self, metadata: Any) -> None: + """ + Adds any kind of metadata to the collection, such as split indices. + """ + pass + + def save(self, name: str) -> None: + """ + Saves the proteins and meta data in compressed format. + """ + pass + + def upload(self, version: str = None) -> None: + """ + Uploads the collection and meta data to Zenodo. `version` defaults to the current date. + """ + pass diff --git a/proteinshake/backend/database.py b/proteinshake/backend/database.py index e69de29b..0021b8e9 100644 --- a/proteinshake/backend/database.py +++ b/proteinshake/backend/database.py @@ -0,0 +1,13 @@ +from pathlib import Path +from .collection import Collection + + +class Database: + def __init__(self, storage: Path) -> None: + pass + + def update(self) -> None: + pass + + def query(self, query: str) -> Collection: + pass diff --git a/proteinshake/backend/protein.py b/proteinshake/backend/protein.py index e69de29b..bc150cb6 100644 --- a/proteinshake/backend/protein.py +++ b/proteinshake/backend/protein.py @@ -0,0 +1,4 @@ +class Protein: + """ + Takes a result row from a database query and converts it to a dictionary. + """ diff --git a/proteinshake/backend/structure.py b/proteinshake/backend/structure.py new file mode 100644 index 00000000..19aee811 --- /dev/null +++ b/proteinshake/backend/structure.py @@ -0,0 +1,4 @@ +class Structure: + """ + Takes a pdb/mmcif file and converts it to a compressed data format. + """ diff --git a/proteinshake/tasks/README.md b/proteinshake/frontend/README.md similarity index 98% rename from proteinshake/tasks/README.md rename to proteinshake/frontend/README.md index 34e66bad..ae992b8c 100644 --- a/proteinshake/tasks/README.md +++ b/proteinshake/frontend/README.md @@ -1,3 +1,7 @@ +## Dataset API + +tba + ## Task API A task brings three objects: `Splitter`, `Target`, `Evaluator` to a given `proteinshake.Dataset` instance. diff --git a/proteinshake/frontend/dataset.py b/proteinshake/frontend/dataset.py new file mode 100644 index 00000000..81ba5f14 --- /dev/null +++ b/proteinshake/frontend/dataset.py @@ -0,0 +1,54 @@ +class Dataset: + def __init__( + self, + path: Path, + version: str = "latest", + shard_size: int = None, + batch_size: int = None, + shuffle: bool = False, + random_seed: int = 42, + ) -> None: + """ + Takes a compressed collection and applies transforms. + `path` is either pointing to a Zenodo repository or a directory in the local filesystem. + """ + pass + + def to_graph( + self, + pre_transform: PreRepresentationTransform = None, + post_transform: PostRepresentationTransform = None, + **kwargs + ) -> Dataset: + """ + Applies pre/representation/post transforms to all proteins in the dataset. + """ + self.proteins.apply(pre_transform) + self.proteins.apply(GraphTransform(**kwargs)) + self.proteins.apply(post_transform) + return self + + def pyg( + self, + pre_transform: PreFrameworkTransform = None, + post_transform: PostFrameworkTransform = None, + **kwargs + ) -> Generic: + """ + Creates an iterable that wraps around __next__ or __getitem__ and applies pre/framework/post transforms. + Returns a framework-specific dataset instance (iterable-style if sharded, map-style if in-memory or on-disk). + """ + pass + + def __next__(self) -> None: + """ + Yields the next protein from a shard. When the shard is finished, loads the next one. + If `shuffle` is True, loads a random shard and applies shuffling within the shard. + """ + pass + + def __getitem__(self, index: Union[int, list, tuple, ndarray]) -> None: + """ + Returns the indexed proteins. Not available with sharding for performance reasons. + """ + pass diff --git a/proteinshake/frontend/datasets/dataset.py b/proteinshake/frontend/datasets/dataset.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/frontend/evaluators/classification.py b/proteinshake/frontend/evaluators/classification.py index e69de29b..e6bcb918 100644 --- a/proteinshake/frontend/evaluators/classification.py +++ b/proteinshake/frontend/evaluators/classification.py @@ -0,0 +1,4 @@ +class ClassificationEvaluator(Evaluator): + def __call__(self, pred : list, truth: list): + return {'accuracy': sklearn.accuracy(pred, truth)} + pass diff --git a/proteinshake/tasks/evaluator.py b/proteinshake/frontend/evaluators/evaluator.py similarity index 100% rename from proteinshake/tasks/evaluator.py rename to proteinshake/frontend/evaluators/evaluator.py diff --git a/proteinshake/frontend/protein.py b/proteinshake/frontend/protein.py new file mode 100644 index 00000000..86fc1a52 --- /dev/null +++ b/proteinshake/frontend/protein.py @@ -0,0 +1,4 @@ +class Protein: + """ + Takes a (compressed) collection protein and converts it to an uncompressed protein dictionary. + """ diff --git a/proteinshake/frontend/splitters/attribute.py b/proteinshake/frontend/splitters/attribute.py new file mode 100644 index 00000000..7e729b58 --- /dev/null +++ b/proteinshake/frontend/splitters/attribute.py @@ -0,0 +1,14 @@ +class AttributeSplitter(Splitter): + """ + Compute splits based on an attribute that already exists in the dataset + """ + + def __init__( + self, train_attribute: str, val_attribute: str, test_attribute: str + ) -> None: + self.train_attribute = train_attribute + self.val_attribute = val_attribute + self.test_attribute = test_attribute + + def __call__(self, dataset) -> tuple[list, list, list]: + pass diff --git a/proteinshake/frontend/splitters/from_existing.py b/proteinshake/frontend/splitters/from_existing.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/frontend/splitters/pairwise_attribute.py b/proteinshake/frontend/splitters/pairwise_attribute.py new file mode 100644 index 00000000..f90b9258 --- /dev/null +++ b/proteinshake/frontend/splitters/pairwise_attribute.py @@ -0,0 +1,17 @@ +class PairwiseAttributeSplitter(Splitter): + """Compute pairwise splits based on an attribute that already exists in the dataset. + Takes all pairs of train/val/test in the single attribute splitting setting.""" + + def __init__( + self, train_attribute: str, val_attribute: str, test_attribute: str + ) -> None: + self.train_attribute = train_attribute + self.val_attribute = val_attribute + self.test_attribute = test_attribute + + def __call__(self, dataset) -> tuple[list, list, list]: + tmp_splitter = AttributeSplitter( + self.train_attribute, self.val_attribute, self.test_attribute + ) + # compute pairs of indices on the non-paired splits + pass diff --git a/proteinshake/frontend/splitters/sequence_similarity.py b/proteinshake/frontend/splitters/sequence_similarity.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/tasks/splitter.py b/proteinshake/frontend/splitters/splitter.py similarity index 59% rename from proteinshake/tasks/splitter.py rename to proteinshake/frontend/splitters/splitter.py index 93988c4a..9560e7a8 100644 --- a/proteinshake/tasks/splitter.py +++ b/proteinshake/frontend/splitters/splitter.py @@ -1,5 +1,7 @@ class Splitter: -""" Abstract class for selecting train/val/test indices given a dataset. -""" + """ + Abstract class for selecting train/val/test indices given a dataset. + """ + def __call__(self, dataset) -> tuple[list, list, list]: raise NotImplementedError diff --git a/proteinshake/frontend/splitters/structure_similarity.py b/proteinshake/frontend/splitters/structure_similarity.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/tasks/pairwise_property_target.py b/proteinshake/frontend/targets/pairwise_property_target.py similarity index 100% rename from proteinshake/tasks/pairwise_property_target.py rename to proteinshake/frontend/targets/pairwise_property_target.py diff --git a/proteinshake/tasks/property_target.py b/proteinshake/frontend/targets/property_target.py similarity index 100% rename from proteinshake/tasks/property_target.py rename to proteinshake/frontend/targets/property_target.py diff --git a/proteinshake/tasks/target.py b/proteinshake/frontend/targets/target.py similarity index 100% rename from proteinshake/tasks/target.py rename to proteinshake/frontend/targets/target.py diff --git a/proteinshake/frontend/task.py b/proteinshake/frontend/task.py new file mode 100644 index 00000000..326248f7 --- /dev/null +++ b/proteinshake/frontend/task.py @@ -0,0 +1,30 @@ +class Task: + """ + Abstract class for Tasks. A task contains the logic for splitting, target generation, and evaluation. + Optionally, we can consider the Task as a way of syncing with a paperwithcode instance https://github.com/paperswithcode/paperswithcode-client. + """ + + def __init__( + self, + dataset: proteinshake.Dataset, + splitter: proteinshake.Splitter, + target: proteinshake.Target, + evaluator: proteinshake.Evaluator, + task_id: int, + ) -> None: + self.dataset = dataset + self.train_idx = splitter.train_idx() + self.val_idx = splitter.val_idx() + self.test_idx = splitter.test_idx() + + self.task_id = task_id + + self.target = target + self.evaluator = evaluator + pass + + def leaderboard_fetch(self): + """Load current leaderboard results for this task""" + + if not self.task_id is None: + return get_leaderboard(f"https:/paperswithcode.com/sota/{self.task_id}") diff --git a/proteinshake/frontend/tasks/pairwise_protein.py b/proteinshake/frontend/tasks/pairwise_protein.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/frontend/tasks/pairwise_residue.py b/proteinshake/frontend/tasks/pairwise_residue.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/frontend/tasks/protein.py b/proteinshake/frontend/tasks/protein.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/frontend/tasks/residue.py b/proteinshake/frontend/tasks/residue.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/frontend/tasks/task.py b/proteinshake/frontend/tasks/task.py deleted file mode 100644 index e69de29b..00000000 diff --git a/proteinshake/transforms/framework/torch.py b/proteinshake/frontend/transforms/framework/torch.py similarity index 100% rename from proteinshake/transforms/framework/torch.py rename to proteinshake/frontend/transforms/framework/torch.py diff --git a/proteinshake/transforms/post_framework/note.md b/proteinshake/frontend/transforms/post_framework/note.md similarity index 100% rename from proteinshake/transforms/post_framework/note.md rename to proteinshake/frontend/transforms/post_framework/note.md diff --git a/proteinshake/transforms/post_representation/add_node_degree.py b/proteinshake/frontend/transforms/post_representation/add_node_degree.py similarity index 100% rename from proteinshake/transforms/post_representation/add_node_degree.py rename to proteinshake/frontend/transforms/post_representation/add_node_degree.py diff --git a/proteinshake/transforms/pre_framework/random_residue_masking.py b/proteinshake/frontend/transforms/pre_framework/random_residue_masking.py similarity index 100% rename from proteinshake/transforms/pre_framework/random_residue_masking.py rename to proteinshake/frontend/transforms/pre_framework/random_residue_masking.py diff --git a/proteinshake/transforms/pre_representation/residue_level.py b/proteinshake/frontend/transforms/pre_representation/residue_level.py similarity index 100% rename from proteinshake/transforms/pre_representation/residue_level.py rename to proteinshake/frontend/transforms/pre_representation/residue_level.py diff --git a/proteinshake/transforms/representation/point.py b/proteinshake/frontend/transforms/representation/point.py similarity index 100% rename from proteinshake/transforms/representation/point.py rename to proteinshake/frontend/transforms/representation/point.py diff --git a/proteinshake/tasks/attribute_splitter.py b/proteinshake/tasks/attribute_splitter.py deleted file mode 100644 index aa8e3f9d..00000000 --- a/proteinshake/tasks/attribute_splitter.py +++ /dev/null @@ -1,16 +0,0 @@ -from .splitter import Splitter - -class AttributeSplitter(Splitter): - """ Compute splits based on an attribute that already exists in the dataset""" - def __init__(self, - train_attribute: str, - val_attribute: str, - test_attribute: str) -> None: - - self.train_attribute = train_attribute - self.val_attribute = val_attribute - self.test_attribute = test_attribute - - def __call__(self, dataset) -> tuple[list, list, list]: - pass - pass diff --git a/proteinshake/tasks/classification_evaluator.py b/proteinshake/tasks/classification_evaluator.py deleted file mode 100644 index e6bcb918..00000000 --- a/proteinshake/tasks/classification_evaluator.py +++ /dev/null @@ -1,4 +0,0 @@ -class ClassificationEvaluator(Evaluator): - def __call__(self, pred : list, truth: list): - return {'accuracy': sklearn.accuracy(pred, truth)} - pass diff --git a/proteinshake/tasks/pairwise_attribute_splitter.py b/proteinshake/tasks/pairwise_attribute_splitter.py deleted file mode 100644 index 86762c65..00000000 --- a/proteinshake/tasks/pairwise_attribute_splitter.py +++ /dev/null @@ -1,21 +0,0 @@ -from .splitter import Splitter - -class PairwiseAttributeSplitter(Splitter): - """ Compute pairwise splits based on an attribute that already exists in the dataset. - Takes all pairs of train/val/test in the single attribute splitting setting.""" - def __init__(self, - train_attribute: str, - val_attribute: str, - test_attribute: str) -> None: - - self.train_attribute = train_attribute - self.val_attribute = val_attribute - self.test_attribute = test_attribute - - def __call__(self, dataset) -> tuple[list, list, list]: - tmp_splitter = AttributeSplitter(self.train_attribute, - self.val_attribute, - self.test_attribute) - # compute pairs of indices on the non-paired splits - pass - pass diff --git a/proteinshake/tasks/task.py b/proteinshake/tasks/task.py deleted file mode 100644 index 11456ad6..00000000 --- a/proteinshake/tasks/task.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -Abstract class for Tasks. A task contains the logic for splitting, target generation, and evaluation. -Optionally, we can consider the Task as a way of syncing with a paperwithcode instance https://github.com/paperswithcode/paperswithcode-client. -""" - -class Task: - def __init__(self, dataset: proteinshake.Dataset, - splitter : proteinshake.Splitter, - target: proteinshake.Target, - evaluator: proteinshake.Evaluator, - task_id: int, - ) -> None: - self.dataset = dataset - self.train_idx = splitter.train_idx() - self.val_idx = splitter.val_idx() - self.test_idx = splitter.test_idx() - - self.task_id = task_id - - self.target = target - self.evaluator = evaluator - pass - - def leaderboard_fetch(self): - """ Load current leaderboard results for this task - """ - - if not self.task_id is None: - return get_leaderboard(f"https:/paperswithcode.com/sota/{self.task_id}") diff --git a/proteinshake/tasks/time_splitter.py b/proteinshake/tasks/time_splitter.py deleted file mode 100644 index 3450cefc..00000000 --- a/proteinshake/tasks/time_splitter.py +++ /dev/null @@ -1,15 +0,0 @@ -from .splitter import Splitter - -class TimeSplitter(Splitter): - """ Compute splits based on an structure publication date which should be an attribute - in the Dataset""" - def __init__(self, - train_cutoff: int, - val_cutoff: int) -> None: - - self.train_cutoff = train_cutoff - self.val_cutoff = val_cutoff - - def __call__(self, dataset) -> tuple[list, list, list]: - pass - pass