Merge pull request #17 from LinkunGao/dev
Dev
LinkunGao authored Sep 13, 2023
2 parents 7aea503 + 82a634a commit 270ad51
Showing 3 changed files with 209 additions and 166 deletions.
132 changes: 69 additions & 63 deletions examples/example_for_updating_metadata.py
@@ -3,76 +3,82 @@


def add_values_dataset_description(dataset_description):
dataset_description.add_values("2.0.0", row_name='metadataversion')
dataset_description.add_values("experimental", row_name='type')
dataset_description.add_values("Duke breast cancer MRI preprocessing", row_name='Title')
dataset_description.add_values("""Preprocessing the breast cancer MRI images and saving in Nifti format""",
row_name='subtitle')
dataset_description.add_values("Breast cancer", "image processing", row_name='Keywords')
dataset_description.add_values("""Preprocessing the breast cancer MRI images and saving in Nifti format""",
row_name="Study purpose")
dataset_description.add_values("The result is great.", row_name="Study primary conclusion")
dataset_description.add_values("derived from Duke Breast Cancer MRI dataset",
row_name='Study data Collection')
dataset_description.add_values("NA", row_name='Study primary conclusion')
dataset_description.add_values("NA", row_name='Study primary conclusion', append=True)
dataset_description.add_values("breast", row_name='Study organ system')
dataset_description.add_values("image processing", row_name='Study approach')
dataset_description.add_values("""dicom2nifti""", row_name='Study technique')
dataset_description.add_values("Lin, Chinchien", "Gao, Linkun", row_name='contributorname')
dataset_description.add_values("Prasad", "Jiali", row_name='contributorNAME', append=True)
dataset_description.add_values(*["bob", "db"], row_name="contributor name", append=True)
dataset_description.add_values('metadataversion', "2.0.0")
dataset_description.add_values(field_name='type', values="experimental")
dataset_description.add_values(field_name='Title', values="Duke breast cancer MRI preprocessing")
dataset_description.add_values(field_name='subtitle',
values="""Preprocessing the breast cancer MRI images and saving in Nifti format""")
dataset_description.add_values(field_name='Keywords', values=["Breast cancer", "image processing"])
dataset_description.add_values(field_name="Study purpose",
values="""Preprocessing the breast cancer MRI images and saving in Nifti format""")
dataset_description.add_values(field_name="Study primary conclusion", values="The result is great.")
dataset_description.add_values(field_name='Study data Collection',
values="derived from Duke Breast Cancer MRI dataset")
dataset_description.add_values(field_name='Study primary conclusion', values="NA")
dataset_description.add_values(field_name='Study primary conclusion', values="NA")
dataset_description.add_values(field_name='Study organ system', values="breast")
dataset_description.add_values(field_name='Study approach', values="image processing")
dataset_description.add_values(field_name='Study technique', values="dicom2nifti", )
dataset_description.add_values(field_name='contributorname', values=["Lin, Chinchien", "Gao, Linkun"])
dataset_description.add_values(field_name='contributorNAME', values=["Prasad", "Jiali"])
dataset_description.add_values(field_name="contributor name", values=["bob", "db"])
dataset_description.add_values(
"https://orcid.org/0000-0001-8170-199X",
"https://orcid.org/0000-0001-8171-199X",
"https://orcid.org/0000-0001-8172-199X",
"https://orcid.org/0000-0001-8173-199X",
"https://orcid.org/0000-0001-8174-199X",
"https://orcid.org/0000-0001-8176-199X",
row_name='Contributor orcid')

dataset_description.add_values(*["University of Auckland"] * 6, row_name='Contributor affiliation')
dataset_description.add_values(*["developer", "developer", "Researcher", "Researcher", "tester", "tester"],
row_name="contributor role")
dataset_description.add_values("source", row_name='Identifier description')
dataset_description.add_values("WasDerivedFrom", row_name='Relation type')
dataset_description.add_values("DTP-UUID", row_name='Identifier')
dataset_description.add_values("12L digital twin UUID", row_name='Identifier type')
dataset_description.add_values("1", row_name='Number of subjects')
dataset_description.add_values("1", row_name='Number of samples')
field_name='Contributor orcid',
values=["https://orcid.org/0000-0001-8170-199X",
"https://orcid.org/0000-0001-8171-199X",
"https://orcid.org/0000-0001-8172-199X",
"https://orcid.org/0000-0001-8173-199X",
"https://orcid.org/0000-0001-8174-199X",
"https://orcid.org/0000-0001-8176-199X"],
append=False)

dataset_description.add_values(field_name='Contributor affiliation', values=["University of Auckland"] * 6, )
dataset_description.add_values(field_name="contributor role",
values=["developer", "developer", "Researcher", "Researcher", "tester", "tester"])
dataset_description.add_values(field_name='Identifier description', values="source")
dataset_description.add_values(field_name='Relation type', values="WasDerivedFrom")
dataset_description.add_values(field_name='Identifier', values="DTP-UUID")
dataset_description.add_values(field_name='Identifier type', values="12L digital twin UUID")


def add_values_for_sample_metadata(sample_metadata):
sample_metadata.add_values(*["test"] * 6, col_name="was derived from", append=False)
sample_metadata.add_values(*["pool id 1", "pool id 2", "pool id 3", "pool id 4", "pool id 5", "pool id 6"],
col_name="pool id", append=False)
sample_metadata.add_values(*["Yes"] * 5, "No", col_name="also in dataset", append=False)
sample_metadata.add_values(*["Global"] * 6, col_name="member of", append=False)
sample_metadata.add_values(field_name="was derived from", values=["test"] * 6, append=False)
sample_metadata.add_values(field_name="pool id",
values=["pool id 1", "pool id 2", "pool id 3", "pool id 4", "pool id 5", "pool id 6"],
append=False)
sample_metadata.add_values(field_name="also in dataset", values=[*["Yes"] * 5, "No"], append=False)
sample_metadata.add_values(field_name="member of", values=["Global"] * 6, append=False)
sample_metadata.add_values(
*["laboratory 1", "laboratory 2", "laboratory 3", "laboratory 4", "laboratory 5", "laboratory 6"],
col_name="laboratory internal id", append=False)
sample_metadata.add_values(*["1991-05-25"] * 3, *["1991-06-10"] * 3, col_name="date of derivation", append=False)
field_name="laboratory internal id",
values=["laboratory 1", "laboratory 2", "laboratory 3", "laboratory 4", "laboratory 5", "laboratory 6"],
append=False)
sample_metadata.add_values(field_name="date of derivation", values=[*["1991-05-25"] * 3, *["1991-06-10"] * 3],
append=False)

sample_metadata.save()


def add_values_for_subject_metadata(subject_metadata):
subject_metadata.add_values("test-xyz", col_name='subject experimental group', append=False)
subject_metadata.add_values("30", col_name='age', append=False)
subject_metadata.add_values("M", col_name='sex', append=False)
subject_metadata.add_values("P", col_name='species', append=False)
subject_metadata.add_values("test", col_name='strain', append=False)
subject_metadata.add_values("old", col_name="age category", append=False)
subject_metadata.add_values(*["pool id 1", "pool id 2", "pool id 3"],
col_name="pool id", append=False)
subject_metadata.add_values(*["Yes"] * 3, col_name="also in dataset", append=False)
subject_metadata.add_values(*["515dsd1515","da515daa69", "515dsa62a"], col_name="RRID for strain", append=False)
subject_metadata.add_values(*["Global"] * 3, col_name="member of", append=False)
subject_metadata.add_values(field_name='subject experimental group', values="test-xyz", append=False)
subject_metadata.add_values(field_name='age', values="30", append=False)
subject_metadata.add_values(field_name='sex', values="Male", append=False)
subject_metadata.add_values(field_name='species', values="P", append=False)
subject_metadata.add_values(field_name='strain', values="test", append=False)
subject_metadata.add_values(field_name="age category", values="old", append=False)
subject_metadata.add_values(field_name="pool id", values=["pool id 1", "pool id 2", "pool id 3"],
append=False)
subject_metadata.add_values(field_name="also in dataset", values=["Yes"] * 3, append=False)
subject_metadata.add_values(field_name="RRID for strain", values=["515dsd1515", "da515daa69", "515dsa62a"],
append=False)
subject_metadata.add_values(field_name="member of", values=["Global"] * 3, append=False)
subject_metadata.add_values(
*["laboratory 1", "laboratory 2", "laboratory 3"],
col_name="laboratory internal id", append=False)
subject_metadata.add_values(*["1996-03-25","1995-09-05", "1996-04-11"], col_name="date of birth", append=False)
field_name="laboratory internal id", values=["laboratory 1", "laboratory 2", "laboratory 3"],
append=False)
subject_metadata.add_values(field_name="date of birth", values=["1996-03-25", "1995-09-05", "1996-04-11"],
append=False)
subject_metadata.save()


if __name__ == '__main__':
save_dir = "./tmp/template/"

@@ -131,8 +137,8 @@ def add_values_for_subject_metadata(subject_metadata):
# print(code_description.get_values(field_name="TSR1: Define Context Clearly Rating (0-4)"))

# NOTE: Step6, remove values in specific header/row_name, code_parameters
dataset_description.remove_values("tester", field_name="contributor role")
# code_parameters.remove_values("test1_name", field_name="name")
dataset_description.remove_values( field_name="contributor role", values="tester")
# code_parameters.remove_values(field_name="name", values="test1_name")
# Step6, remove entire values in code_parameters_editor
# code_parameters.clear_values()
# Step6, remove entire values in dataset_description_editor
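`remove_values` migrates to the same keyword convention. A sketch under the same assumptions as above:

```python
# Keyword form of remove_values, mirroring the change in this hunk.
dataset_description.remove_values(field_name="contributor role", values="tester")
dataset_description.save()  # persist, as the example does for the other editors
```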
@@ -166,7 +172,7 @@ def add_values_for_subject_metadata(subject_metadata):
subjects=["1", "sub-2"], subject_metadata={
"subject experimental group": "experimental",
"age": "041Y",
"sex": "F",
"sex": "Female",
"species": "human",
"strain": "tissue",
"age category": "middle adulthood"
@@ -189,18 +195,18 @@ def add_values_for_subject_metadata(subject_metadata):

dataset.add_thumbnail("./test_data/thumbnail_0.jpg")
dataset.add_thumbnail("./test_data/thumbnail_1.jpg")
dataset.delete_data("./tmp/template/primary/thumbnail_0.jpg")
# dataset.delete_data("./tmp/template/docs/thumbnail_0.jpg")
# NOTE: Step9 Delete folder
# Step9.1 Delete subject folder
# dataset.delete_subject("./tmp/template/primary/subject-xyz")
# Step9.2 Delete sample folder
# dataset.delete_samples(["./tmp/template/primary/subject-1/func"])

# dataset_description.clear_values()
dataset.save()

# NOTE: Step10 validate dataset via schema
description_meta = schema.load_data("./tmp/template/dataset_description.xlsx")
validator.validate(description_meta, category="dataset_description", version="2.0.0")
sub_meta = schema.load_data("./tmp/template/subjects.xlsx")
validator.validate(sub_meta, category="subjects", version="2.0.0")

32 changes: 19 additions & 13 deletions sparc_me/core/dataset.py
@@ -52,6 +52,12 @@ def get_dataset_path(self):
"""
return str(self._dataset_path)

+    def get_dataset(self):
+        """
+        :return: current dataset dict
+        """
+        return self._dataset
+
def _get_template_dir(self, version):
"""
Get template directory path
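Since `load_from_template` no longer returns the internal dict (its `return self._dataset` is removed in the next hunk), the new `get_dataset()` accessor fills that role. A sketch, under the same assumed import as above:

```python
# get_dataset() replaces the value formerly returned by load_from_template().
dataset = Dataset()
dataset.load_from_template(version="2.0.0")
ds_dict = dataset.get_dataset()
print(ds_dict.keys())  # one entry per metadata category
```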
@@ -143,7 +149,6 @@ def load_from_template(self, version):

self._generate_metadata()

-        return self._dataset

def _convert_version_format(self, version):
"""
@@ -419,7 +424,7 @@ def _generate_metadata(self):
categories = self.list_categories(self._version, print_list=False)
for category in categories:
metadata = self._dataset.get(category).get("metadata")
-            self._metadata[category] = Metadata(category, metadata, self._dataset_path)
+            self._metadata[category] = Metadata(category, metadata, self._version, self._dataset_path)

def get_metadata(self, category):
"""
@@ -530,15 +535,16 @@ def append(self, category, row, check_exist=False, unique_column=None):
msg = "Dataset not defined. Please load the dataset in advance."
raise ValueError(msg)

-        metadata = self._dataset.get(category).get("metadata")
+        # metadata = self._dataset.get(category).get("metadata")
+        category_metadata = self.get_metadata(category)
if check_exist:
# In version 1, the unique column is not the column 0. Hence, unique column must be specified
if unique_column is None:
error_msg = "Provide which column in category is unique. Ex: subject_id"
raise ValueError(error_msg)

try:
-                row_index = check_row_exist(metadata, unique_column, unique_value=row[unique_column])
+                row_index = check_row_exist(category_metadata.metadata, unique_column, unique_value=row[unique_column])
except ValueError:
error_msg = "Row values provided does not contain a unique identifier"
raise ValueError(error_msg)
@@ -548,14 +554,14 @@ def append(self, category, row, check_exist=False, unique_column=None):
if row_index == -1:
# Add row
row_df = pd.DataFrame([row])
-            metadata = pd.concat([metadata, row_df], axis=0,
+            category_metadata.metadata = pd.concat([category_metadata.metadata, row_df], axis=0,
ignore_index=True) # If new header comes, it will be added as a new column with its value
else:
# Append row with additional values
for key, value in row.items():
-                metadata.loc[row_index, key] = value
+                category_metadata.metadata.loc[row_index, key] = value

-        self._dataset[category]["metadata"] = metadata
+        self._dataset[category]["metadata"] = category_metadata.metadata
return self._dataset

def update_by_json(self, category, json_file):
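`append` now routes through the category's `Metadata` object (via `get_metadata`) instead of the raw `self._dataset[...]["metadata"]` DataFrame, so rows appended here and values set through `add_values` operate on the same object. A usage sketch; the row keys are illustrative, not taken from this diff:

```python
# Append a subject row; check_exist requires naming the unique column.
dataset.append(category="subjects",
               row={"subject id": "sub-1", "subject experimental group": "experimental"},
               check_exist=True,
               unique_column="subject id")
```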
@@ -728,7 +734,7 @@ def add_primary_data(self, source_path, subject, sample, copy=True, overwrite=Tr
if not os.path.exists(subjects_file_path):
self.generate_file_from_template(subjects_file_path, 'subjects')

-        self.load_dataset(dataset_path=self._dataset_path, from_template=False, version=self._version)
+        # self.load_dataset(dataset_path=self._dataset_path, from_template=False, version=self._version)

if not sample_metadata:
self.append(
@@ -811,7 +817,7 @@ def add_thumbnail(self, source_path, copy=True, overwrite=True):
raise ValueError(msg)
else:
filename = file_source_path.name
-            destination_path = self._dataset_path.joinpath('primary', filename)
+            destination_path = self._dataset_path.joinpath('docs', filename)
if destination_path.exists():
if overwrite:
self._delete_data(destination_path)
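Thumbnails are now copied into the dataset's `docs/` folder rather than `primary/`, which is what the example file's switch to deleting `./tmp/template/docs/thumbnail_0.jpg` reflects. A sketch:

```python
# Thumbnails now resolve to <dataset>/docs/<filename> (previously primary/).
dataset.add_thumbnail("./test_data/thumbnail_0.jpg")
# expected destination: ./tmp/template/docs/thumbnail_0.jpg
```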
@@ -1083,8 +1089,8 @@ def _update_sub_sam_nums_in_dataset_description(self, primary_folder):
folders = get_sub_folder_paths_in_folder(sub)
sample_folders.extend(folders)
dataset_description_metadata = self._metadata["dataset_description"]
-        dataset_description_metadata.add_values(len(subject_folders), row_name="Number of subjects",
-                                                col_name='Value', append=False)
-        dataset_description_metadata.add_values(len(sample_folders), row_name="Number of samples",
-                                                col_name='Value', append=False)
+        dataset_description_metadata.add_values(field_name="Number of subjects",values=len(subject_folders),
+                                                append=False)
+        dataset_description_metadata.add_values(field_name="Number of samples", values=len(sample_folders),
+                                                append=False)
dataset_description_metadata.save()
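The subject and sample counts are now written with the keyword API and derived from the folders under `primary/`, which is why the example file above no longer sets "Number of subjects"/"Number of samples" by hand. A sketch of the kind of call after which the counts end up refreshed; the paths are illustrative, and whether `add_primary_data` itself invokes this update is not shown in the diff:

```python
# After primary data is added and the dataset saved, dataset_description
# carries counts computed from the primary/ folder tree.
dataset.add_primary_data("./test_data/sample1", subject="subject-1", sample="sample-1")
dataset.save()
```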
