intermediate commit

BorgwardtLab · Nov 28, 2023 · a05df4d · a05df4d
1 parent bb9b412
commit a05df4d
Show file tree

Hide file tree

Showing 37 changed files with 177 additions and 87 deletions.
diff --git a/proteinshake/frontend/dataloader.py → ...inshake/backend/adapters/__baseclass__.py b/proteinshake/frontend/dataloader.py → ...inshake/backend/adapters/__baseclass__.py
diff --git a/proteinshake/backend/collection.py b/proteinshake/backend/collection.py
@@ -0,0 +1,25 @@
+class Collection:
+    """
+    Holds a set of proteins as the result of a database query and prepares it for dataset creation.
+    """
+
+    def __init__(self, proteins: list[dict]) -> None:
+        pass
+
+    def add(self, metadata: Any) -> None:
+        """
+        Adds any kind of metadata to the collection, such as split indices.
+        """
+        pass
+
+    def save(self, name: str) -> None:
+        """
+        Saves the proteins and metadata in compressed format.
+        """
+        pass
+
+    def upload(self, version: str = None) -> None:
+        """
+        Uploads the collection and metadata to Zenodo. `version` defaults to the current date.
+        """
+        pass
diff --git a/proteinshake/backend/database.py b/proteinshake/backend/database.py
@@ -0,0 +1,13 @@
+from pathlib import Path
+from .collection import Collection
+
+
+class Database:
+    def __init__(self, storage: Path) -> None:
+        pass
+
+    def update(self) -> None:
+        pass
+
+    def query(self, query: str) -> Collection:
+        pass
diff --git a/proteinshake/backend/protein.py b/proteinshake/backend/protein.py
@@ -0,0 +1,4 @@
+class Protein:
+    """
+    Takes a result row from a database query and converts it to a dictionary.
+    """
diff --git a/proteinshake/backend/structure.py b/proteinshake/backend/structure.py
@@ -0,0 +1,4 @@
+class Structure:
+    """
+    Takes a pdb/mmcif file and converts it to a compressed data format.
+    """
diff --git a/proteinshake/tasks/README.md → proteinshake/frontend/README.md b/proteinshake/tasks/README.md → proteinshake/frontend/README.md
@@ -1,3 +1,7 @@
+## Dataset API
+
+tba
+
 ## Task API
 
 A task brings three objects: `Splitter`, `Target`, `Evaluator` to a given `proteinshake.Dataset` instance.

diff --git a/proteinshake/frontend/dataset.py b/proteinshake/frontend/dataset.py
@@ -0,0 +1,54 @@
+class Dataset:
+    def __init__(
+        self,
+        path: Path,
+        version: str = "latest",
+        shard_size: int = None,
+        batch_size: int = None,
+        shuffle: bool = False,
+        random_seed: int = 42,
+    ) -> None:
+        """
+        Takes a compressed collection and applies transforms.
+        `path` is either pointing to a Zenodo repository or a directory in the local filesystem.
+        """
+        pass
+
+    def to_graph(
+        self,
+        pre_transform: PreRepresentationTransform = None,
+        post_transform: PostRepresentationTransform = None,
+        **kwargs
+    ) -> Dataset:
+        """
+        Applies pre/representation/post transforms to all proteins in the dataset.
+        """
+        self.proteins.apply(pre_transform)
+        self.proteins.apply(GraphTransform(**kwargs))
+        self.proteins.apply(post_transform)
+        return self
+
+    def pyg(
+        self,
+        pre_transform: PreFrameworkTransform = None,
+        post_transform: PostFrameworkTransform = None,
+        **kwargs
+    ) -> Generic:
+        """
+        Creates an iterable that wraps around __next__ or __getitem__ and applies pre/framework/post transforms.
+        Returns a framework-specific dataset instance (iterable-style if sharded, map-style if in-memory or on-disk).
+        """
+        pass
+
+    def __next__(self) -> None:
+        """
+        Yields the next protein from a shard. When the shard is finished, loads the next one.
+        If `shuffle` is True, loads a random shard and applies shuffling within the shard.
+        """
+        pass
+
+    def __getitem__(self, index: Union[int, list, tuple, ndarray]) -> None:
+        """
+        Returns the indexed proteins. Not available with sharding for performance reasons.
+        """
+        pass
diff --git a/proteinshake/frontend/datasets/dataset.py b/proteinshake/frontend/datasets/dataset.py
diff --git a/proteinshake/frontend/evaluators/classification.py b/proteinshake/frontend/evaluators/classification.py
@@ -0,0 +1,4 @@
+class ClassificationEvaluator(Evaluator):
+    def __call__(self, pred: list, truth: list) -> dict:  # maps metric name -> score
+        from sklearn.metrics import accuracy_score  # scikit-learn has no top-level `sklearn.accuracy`
+        return {'accuracy': accuracy_score(truth, pred)}
diff --git a/proteinshake/tasks/evaluator.py → ...einshake/frontend/evaluators/evaluator.py b/proteinshake/tasks/evaluator.py → ...einshake/frontend/evaluators/evaluator.py
diff --git a/proteinshake/frontend/protein.py b/proteinshake/frontend/protein.py
@@ -0,0 +1,4 @@
+class Protein:
+    """
+    Takes a (compressed) collection protein and converts it to an uncompressed protein dictionary.
+    """
diff --git a/proteinshake/frontend/splitters/attribute.py b/proteinshake/frontend/splitters/attribute.py
@@ -0,0 +1,14 @@
+class AttributeSplitter(Splitter):
+    """
+    Compute splits based on an attribute that already exists in the dataset
+    """
+
+    def __init__(
+        self, train_attribute: str, val_attribute: str, test_attribute: str
+    ) -> None:
+        self.train_attribute = train_attribute
+        self.val_attribute = val_attribute
+        self.test_attribute = test_attribute
+
+    def __call__(self, dataset) -> tuple[list, list, list]:
+        pass
diff --git a/proteinshake/frontend/splitters/from_existing.py b/proteinshake/frontend/splitters/from_existing.py
diff --git a/proteinshake/frontend/splitters/pairwise_attribute.py b/proteinshake/frontend/splitters/pairwise_attribute.py
@@ -0,0 +1,17 @@
+class PairwiseAttributeSplitter(Splitter):
+    """Compute pairwise splits based on an attribute that already exists in the dataset.
+    Takes all pairs of train/val/test in the single attribute splitting setting."""
+
+    def __init__(
+        self, train_attribute: str, val_attribute: str, test_attribute: str
+    ) -> None:
+        self.train_attribute = train_attribute
+        self.val_attribute = val_attribute
+        self.test_attribute = test_attribute
+
+    def __call__(self, dataset) -> tuple[list, list, list]:
+        tmp_splitter = AttributeSplitter(
+            self.train_attribute, self.val_attribute, self.test_attribute
+        )
+        # compute pairs of indices on the non-paired splits
+        pass
diff --git a/proteinshake/frontend/splitters/sequence_similarity.py b/proteinshake/frontend/splitters/sequence_similarity.py
diff --git a/proteinshake/tasks/splitter.py → proteinshake/frontend/splitters/splitter.py b/proteinshake/tasks/splitter.py → proteinshake/frontend/splitters/splitter.py
@@ -1,5 +1,7 @@
 class Splitter:
-""" Abstract class for selecting train/val/test indices given a dataset.
-"""
+    """
+    Abstract class for selecting train/val/test indices given a dataset.
+    """
+
     def __call__(self, dataset) -> tuple[list, list, list]:
         raise NotImplementedError
diff --git a/proteinshake/frontend/splitters/structure_similarity.py b/proteinshake/frontend/splitters/structure_similarity.py
diff --git a/...inshake/tasks/pairwise_property_target.py → ...ntend/targets/pairwise_property_target.py b/...inshake/tasks/pairwise_property_target.py → ...ntend/targets/pairwise_property_target.py
diff --git a/proteinshake/tasks/property_target.py → ...shake/frontend/targets/property_target.py b/proteinshake/tasks/property_target.py → ...shake/frontend/targets/property_target.py
diff --git a/proteinshake/tasks/target.py → proteinshake/frontend/targets/target.py b/proteinshake/tasks/target.py → proteinshake/frontend/targets/target.py
diff --git a/proteinshake/frontend/task.py b/proteinshake/frontend/task.py
@@ -0,0 +1,30 @@
+class Task:
+    """
+    Abstract class for Tasks. A task contains the logic for splitting, target generation, and evaluation.
+    Optionally, we can consider the Task as a way of syncing with a Papers with Code instance, see https://github.com/paperswithcode/paperswithcode-client.
+    """
+
+    def __init__(
+        self,
+        dataset: proteinshake.Dataset,
+        splitter: proteinshake.Splitter,
+        target: proteinshake.Target,
+        evaluator: proteinshake.Evaluator,
+        task_id: int,
+    ) -> None:
+        """Store the dataset, compute the split indices, and keep the target, evaluator and task id."""
+        self.dataset = dataset
+        # A Splitter is a callable mapping a dataset to (train, val, test) index lists;
+        # it defines no train_idx()/val_idx()/test_idx() methods.
+        self.train_idx, self.val_idx, self.test_idx = splitter(dataset)
+
+        self.task_id = task_id
+
+        self.target = target
+        self.evaluator = evaluator
+
+    def leaderboard_fetch(self):
+        """Load current leaderboard results for this task (returns None if `task_id` is None)."""
+        # The scheme separator must be "://"; "https:/..." is not a valid absolute URL.
+        if self.task_id is not None:
+            return get_leaderboard(f"https://paperswithcode.com/sota/{self.task_id}")
diff --git a/proteinshake/frontend/tasks/pairwise_protein.py b/proteinshake/frontend/tasks/pairwise_protein.py
diff --git a/proteinshake/frontend/tasks/pairwise_residue.py b/proteinshake/frontend/tasks/pairwise_residue.py
diff --git a/proteinshake/frontend/tasks/protein.py b/proteinshake/frontend/tasks/protein.py
diff --git a/proteinshake/frontend/tasks/residue.py b/proteinshake/frontend/tasks/residue.py
diff --git a/proteinshake/frontend/tasks/task.py b/proteinshake/frontend/tasks/task.py
diff --git a/proteinshake/transforms/framework/torch.py → ...ke/frontend/transforms/framework/torch.py b/proteinshake/transforms/framework/torch.py → ...ke/frontend/transforms/framework/torch.py
diff --git a/...inshake/transforms/post_framework/note.md → ...rontend/transforms/post_framework/note.md b/...inshake/transforms/post_framework/note.md → ...rontend/transforms/post_framework/note.md
diff --git a/...ms/post_representation/add_node_degree.py → ...ms/post_representation/add_node_degree.py b/...ms/post_representation/add_node_degree.py → ...ms/post_representation/add_node_degree.py
diff --git a/...s/pre_framework/random_residue_masking.py → ...s/pre_framework/random_residue_masking.py b/...s/pre_framework/random_residue_masking.py → ...s/pre_framework/random_residue_masking.py
diff --git a/...forms/pre_representation/residue_level.py → ...forms/pre_representation/residue_level.py b/...forms/pre_representation/residue_level.py → ...forms/pre_representation/residue_level.py
diff --git a/...nshake/transforms/representation/point.py → ...ontend/transforms/representation/point.py b/...nshake/transforms/representation/point.py → ...ontend/transforms/representation/point.py
diff --git a/proteinshake/tasks/attribute_splitter.py b/proteinshake/tasks/attribute_splitter.py
diff --git a/proteinshake/tasks/classification_evaluator.py b/proteinshake/tasks/classification_evaluator.py
diff --git a/proteinshake/tasks/pairwise_attribute_splitter.py b/proteinshake/tasks/pairwise_attribute_splitter.py
diff --git a/proteinshake/tasks/task.py b/proteinshake/tasks/task.py
diff --git a/proteinshake/tasks/time_splitter.py b/proteinshake/tasks/time_splitter.py