Merge pull request #17 from LinkunGao/dev
Dev
LinkunGao authored Sep 13, 2023
2 parents 7aea503 + 82a634a commit 270ad51
Showing 3 changed files with 209 additions and 166 deletions.
132 changes: 69 additions & 63 deletions examples/example_for_updating_metadata.py
@@ -3,76 +3,82 @@


def add_values_dataset_description(dataset_description):
dataset_description.add_values("2.0.0", row_name='metadataversion')
dataset_description.add_values("experimental", row_name='type')
dataset_description.add_values("Duke breast cancer MRI preprocessing", row_name='Title')
dataset_description.add_values("""Preprocessing the breast cancer MRI images and saving in Nifti format""",
row_name='subtitle')
dataset_description.add_values("Breast cancer", "image processing", row_name='Keywords')
dataset_description.add_values("""Preprocessing the breast cancer MRI images and saving in Nifti format""",
row_name="Study purpose")
dataset_description.add_values("The result is great.", row_name="Study primary conclusion")
dataset_description.add_values("derived from Duke Breast Cancer MRI dataset",
row_name='Study data Collection')
dataset_description.add_values("NA", row_name='Study primary conclusion')
dataset_description.add_values("NA", row_name='Study primary conclusion', append=True)
dataset_description.add_values("breast", row_name='Study organ system')
dataset_description.add_values("image processing", row_name='Study approach')
dataset_description.add_values("""dicom2nifti""", row_name='Study technique')
dataset_description.add_values("Lin, Chinchien", "Gao, Linkun", row_name='contributorname')
dataset_description.add_values("Prasad", "Jiali", row_name='contributorNAME', append=True)
dataset_description.add_values(*["bob", "db"], row_name="contributor name", append=True)
dataset_description.add_values('metadataversion', "2.0.0")
dataset_description.add_values(field_name='type', values="experimental")
dataset_description.add_values(field_name='Title', values="Duke breast cancer MRI preprocessing")
dataset_description.add_values(field_name='subtitle',
values="""Preprocessing the breast cancer MRI images and saving in Nifti format""")
dataset_description.add_values(field_name='Keywords', values=["Breast cancer", "image processing"])
dataset_description.add_values(field_name="Study purpose",
values="""Preprocessing the breast cancer MRI images and saving in Nifti format""")
dataset_description.add_values(field_name="Study primary conclusion", values="The result is great.")
dataset_description.add_values(field_name='Study data Collection',
values="derived from Duke Breast Cancer MRI dataset")
dataset_description.add_values(field_name='Study primary conclusion', values="NA")
dataset_description.add_values(field_name='Study primary conclusion', values="NA")
dataset_description.add_values(field_name='Study organ system', values="breast")
dataset_description.add_values(field_name='Study approach', values="image processing")
dataset_description.add_values(field_name='Study technique', values="dicom2nifti", )
dataset_description.add_values(field_name='contributorname', values=["Lin, Chinchien", "Gao, Linkun"])
dataset_description.add_values(field_name='contributorNAME', values=["Prasad", "Jiali"])
dataset_description.add_values(field_name="contributor name", values=["bob", "db"])
dataset_description.add_values(
"https://orcid.org/0000-0001-8170-199X",
"https://orcid.org/0000-0001-8171-199X",
"https://orcid.org/0000-0001-8172-199X",
"https://orcid.org/0000-0001-8173-199X",
"https://orcid.org/0000-0001-8174-199X",
"https://orcid.org/0000-0001-8176-199X",
row_name='Contributor orcid')

dataset_description.add_values(*["University of Auckland"] * 6, row_name='Contributor affiliation')
dataset_description.add_values(*["developer", "developer", "Researcher", "Researcher", "tester", "tester"],
row_name="contributor role")
dataset_description.add_values("source", row_name='Identifier description')
dataset_description.add_values("WasDerivedFrom", row_name='Relation type')
dataset_description.add_values("DTP-UUID", row_name='Identifier')
dataset_description.add_values("12L digital twin UUID", row_name='Identifier type')
dataset_description.add_values("1", row_name='Number of subjects')
dataset_description.add_values("1", row_name='Number of samples')
field_name='Contributor orcid',
values=["https://orcid.org/0000-0001-8170-199X",
"https://orcid.org/0000-0001-8171-199X",
"https://orcid.org/0000-0001-8172-199X",
"https://orcid.org/0000-0001-8173-199X",
"https://orcid.org/0000-0001-8174-199X",
"https://orcid.org/0000-0001-8176-199X"],
append=False)

dataset_description.add_values(field_name='Contributor affiliation', values=["University of Auckland"] * 6, )
dataset_description.add_values(field_name="contributor role",
values=["developer", "developer", "Researcher", "Researcher", "tester", "tester"])
dataset_description.add_values(field_name='Identifier description', values="source")
dataset_description.add_values(field_name='Relation type', values="WasDerivedFrom")
dataset_description.add_values(field_name='Identifier', values="DTP-UUID")
dataset_description.add_values(field_name='Identifier type', values="12L digital twin UUID")


def add_values_for_sample_metadata(sample_metadata):
sample_metadata.add_values(*["test"] * 6, col_name="was derived from", append=False)
sample_metadata.add_values(*["pool id 1", "pool id 2", "pool id 3", "pool id 4", "pool id 5", "pool id 6"],
col_name="pool id", append=False)
sample_metadata.add_values(*["Yes"] * 5, "No", col_name="also in dataset", append=False)
sample_metadata.add_values(*["Global"] * 6, col_name="member of", append=False)
sample_metadata.add_values(field_name="was derived from", values=["test"] * 6, append=False)
sample_metadata.add_values(field_name="pool id",
values=["pool id 1", "pool id 2", "pool id 3", "pool id 4", "pool id 5", "pool id 6"],
append=False)
sample_metadata.add_values(field_name="also in dataset", values=[*["Yes"] * 5, "No"], append=False)
sample_metadata.add_values(field_name="member of", values=["Global"] * 6, append=False)
sample_metadata.add_values(
*["laboratory 1", "laboratory 2", "laboratory 3", "laboratory 4", "laboratory 5", "laboratory 6"],
col_name="laboratory internal id", append=False)
sample_metadata.add_values(*["1991-05-25"] * 3, *["1991-06-10"] * 3, col_name="date of derivation", append=False)
field_name="laboratory internal id",
values=["laboratory 1", "laboratory 2", "laboratory 3", "laboratory 4", "laboratory 5", "laboratory 6"],
append=False)
sample_metadata.add_values(field_name="date of derivation", values=[*["1991-05-25"] * 3, *["1991-06-10"] * 3],
append=False)

sample_metadata.save()


def add_values_for_subject_metadata(subject_metadata):
subject_metadata.add_values("test-xyz", col_name='subject experimental group', append=False)
subject_metadata.add_values("30", col_name='age', append=False)
subject_metadata.add_values("M", col_name='sex', append=False)
subject_metadata.add_values("P", col_name='species', append=False)
subject_metadata.add_values("test", col_name='strain', append=False)
subject_metadata.add_values("old", col_name="age category", append=False)
subject_metadata.add_values(*["pool id 1", "pool id 2", "pool id 3"],
col_name="pool id", append=False)
subject_metadata.add_values(*["Yes"] * 3, col_name="also in dataset", append=False)
subject_metadata.add_values(*["515dsd1515","da515daa69", "515dsa62a"], col_name="RRID for strain", append=False)
subject_metadata.add_values(*["Global"] * 3, col_name="member of", append=False)
subject_metadata.add_values(field_name='subject experimental group', values="test-xyz", append=False)
subject_metadata.add_values(field_name='age', values="30", append=False)
subject_metadata.add_values(field_name='sex', values="Male", append=False)
subject_metadata.add_values(field_name='species', values="P", append=False)
subject_metadata.add_values(field_name='strain', values="test", append=False)
subject_metadata.add_values(field_name="age category", values="old", append=False)
subject_metadata.add_values(field_name="pool id", values=["pool id 1", "pool id 2", "pool id 3"],
append=False)
subject_metadata.add_values(field_name="also in dataset", values=["Yes"] * 3, append=False)
subject_metadata.add_values(field_name="RRID for strain", values=["515dsd1515", "da515daa69", "515dsa62a"],
append=False)
subject_metadata.add_values(field_name="member of", values=["Global"] * 3, append=False)
subject_metadata.add_values(
*["laboratory 1", "laboratory 2", "laboratory 3"],
col_name="laboratory internal id", append=False)
subject_metadata.add_values(*["1996-03-25","1995-09-05", "1996-04-11"], col_name="date of birth", append=False)
field_name="laboratory internal id", values=["laboratory 1", "laboratory 2", "laboratory 3"],
append=False)
subject_metadata.add_values(field_name="date of birth", values=["1996-03-25", "1995-09-05", "1996-04-11"],
append=False)
subject_metadata.save()


if __name__ == '__main__':
save_dir = "./tmp/template/"

@@ -131,8 +137,8 @@ def add_values_for_subject_metadata(subject_metadata):
# print(code_description.get_values(field_name="TSR1: Define Context Clearly Rating (0-4)"))

# NOTE: Step6, remove values in specific header/row_name, code_parameters
dataset_description.remove_values("tester", field_name="contributor role")
# code_parameters.remove_values("test1_name", field_name="name")
dataset_description.remove_values( field_name="contributor role", values="tester")
# code_parameters.remove_values(field_name="name", values="test1_name")
# Step6, remove entire values in code_parameters_editor
# code_parameters.clear_values()
# Step6, remove entire values in dataset_description_editor
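`remove_values` migrates to the same keyword convention. A sketch under the same assumptions as above:

```python
# Keyword form of remove_values, mirroring the change in this hunk.
dataset_description.remove_values(field_name="contributor role", values="tester")
dataset_description.save()  # persist, as the example does for the other editors
```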
@@ -166,7 +172,7 @@ def add_values_for_subject_metadata(subject_metadata):
subjects=["1", "sub-2"], subject_metadata={
"subject experimental group": "experimental",
"age": "041Y",
"sex": "F",
"sex": "Female",
"species": "human",
"strain": "tissue",
"age category": "middle adulthood"
@@ -189,18 +195,18 @@ def add_values_for_subject_metadata(subject_metadata):

dataset.add_thumbnail("./test_data/thumbnail_0.jpg")
dataset.add_thumbnail("./test_data/thumbnail_1.jpg")
dataset.delete_data("./tmp/template/primary/thumbnail_0.jpg")
# dataset.delete_data("./tmp/template/docs/thumbnail_0.jpg")
# NOTE: Step9 Delete folder
# Step9.1 Delete subject folder
# dataset.delete_subject("./tmp/template/primary/subject-xyz")
# Step9.2 Delete sample folder
# dataset.delete_samples(["./tmp/template/primary/subject-1/func"])

# dataset_description.clear_values()
dataset.save()

# NOTE: Step10 validate dataset via schema
description_meta = schema.load_data("./tmp/template/dataset_description.xlsx")
validator.validate(description_meta, category="dataset_description", version="2.0.0")
sub_meta = schema.load_data("./tmp/template/subjects.xlsx")
validator.validate(sub_meta, category="subjects", version="2.0.0")

32 changes: 19 additions & 13 deletions sparc_me/core/dataset.py
@@ -52,6 +52,12 @@ def get_dataset_path(self):
"""
return str(self._dataset_path)

+    def get_dataset(self):
+        """
+        :return: current dataset dict
+        """
+        return self._dataset
+
def _get_template_dir(self, version):
"""
Get template directory path
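Since `load_from_template` no longer returns the internal dict (its `return self._dataset` is removed in the next hunk), the new `get_dataset()` accessor fills that role. A sketch, under the same assumed import as above:

```python
# get_dataset() replaces the value formerly returned by load_from_template().
dataset = Dataset()
dataset.load_from_template(version="2.0.0")
ds_dict = dataset.get_dataset()
print(ds_dict.keys())  # one entry per metadata category
```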
@@ -143,7 +149,6 @@ def load_from_template(self, version):

self._generate_metadata()

-        return self._dataset

def _convert_version_format(self, version):
"""
@@ -419,7 +424,7 @@ def _generate_metadata(self):
categories = self.list_categories(self._version, print_list=False)
for category in categories:
metadata = self._dataset.get(category).get("metadata")
-            self._metadata[category] = Metadata(category, metadata, self._dataset_path)
+            self._metadata[category] = Metadata(category, metadata, self._version, self._dataset_path)

def get_metadata(self, category):
"""
@@ -530,15 +535,16 @@ def append(self, category, row, check_exist=False, unique_column=None):
msg = "Dataset not defined. Please load the dataset in advance."
raise ValueError(msg)

-        metadata = self._dataset.get(category).get("metadata")
+        # metadata = self._dataset.get(category).get("metadata")
+        category_metadata = self.get_metadata(category)
if check_exist:
# In version 1, the unique column is not the column 0. Hence, unique column must be specified
if unique_column is None:
error_msg = "Provide which column in category is unique. Ex: subject_id"
raise ValueError(error_msg)

try:
-                row_index = check_row_exist(metadata, unique_column, unique_value=row[unique_column])
+                row_index = check_row_exist(category_metadata.metadata, unique_column, unique_value=row[unique_column])
except ValueError:
error_msg = "Row values provided does not contain a unique identifier"
raise ValueError(error_msg)
@@ -548,14 +554,14 @@ def append(self, category, row, check_exist=False, unique_column=None):
if row_index == -1:
# Add row
row_df = pd.DataFrame([row])
-            metadata = pd.concat([metadata, row_df], axis=0,
+            category_metadata.metadata = pd.concat([category_metadata.metadata, row_df], axis=0,
ignore_index=True) # If new header comes, it will be added as a new column with its value
else:
# Append row with additional values
for key, value in row.items():
-                metadata.loc[row_index, key] = value
+                category_metadata.metadata.loc[row_index, key] = value

-        self._dataset[category]["metadata"] = metadata
+        self._dataset[category]["metadata"] = category_metadata.metadata
return self._dataset

def update_by_json(self, category, json_file):
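`append` now routes through the category's `Metadata` object (via `get_metadata`) instead of the raw `self._dataset[...]["metadata"]` DataFrame, so rows appended here and values set through `add_values` operate on the same object. A usage sketch; the row keys are illustrative, not taken from this diff:

```python
# Append a subject row; check_exist requires naming the unique column.
dataset.append(category="subjects",
               row={"subject id": "sub-1", "subject experimental group": "experimental"},
               check_exist=True,
               unique_column="subject id")
```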
@@ -728,7 +734,7 @@ def add_primary_data(self, source_path, subject, sample, copy=True, overwrite=Tr
if not os.path.exists(subjects_file_path):
self.generate_file_from_template(subjects_file_path, 'subjects')

-        self.load_dataset(dataset_path=self._dataset_path, from_template=False, version=self._version)
+        # self.load_dataset(dataset_path=self._dataset_path, from_template=False, version=self._version)

if not sample_metadata:
self.append(
@@ -811,7 +817,7 @@ def add_thumbnail(self, source_path, copy=True, overwrite=True):
raise ValueError(msg)
else:
filename = file_source_path.name
-            destination_path = self._dataset_path.joinpath('primary', filename)
+            destination_path = self._dataset_path.joinpath('docs', filename)
if destination_path.exists():
if overwrite:
self._delete_data(destination_path)
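Thumbnails are now copied into the dataset's `docs/` folder rather than `primary/`, which is what the example file's switch to deleting `./tmp/template/docs/thumbnail_0.jpg` reflects. A sketch:

```python
# Thumbnails now resolve to <dataset>/docs/<filename> (previously primary/).
dataset.add_thumbnail("./test_data/thumbnail_0.jpg")
# expected destination: ./tmp/template/docs/thumbnail_0.jpg
```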
@@ -1083,8 +1089,8 @@ def _update_sub_sam_nums_in_dataset_description(self, primary_folder):
folders = get_sub_folder_paths_in_folder(sub)
sample_folders.extend(folders)
dataset_description_metadata = self._metadata["dataset_description"]
-        dataset_description_metadata.add_values(len(subject_folders), row_name="Number of subjects",
-                                                col_name='Value', append=False)
-        dataset_description_metadata.add_values(len(sample_folders), row_name="Number of samples",
-                                                col_name='Value', append=False)
+        dataset_description_metadata.add_values(field_name="Number of subjects",values=len(subject_folders),
+                                                append=False)
+        dataset_description_metadata.add_values(field_name="Number of samples", values=len(sample_folders),
+                                                append=False)
dataset_description_metadata.save()
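The subject and sample counts are now written with the keyword API and derived from the folders under `primary/`, which is why the example file above no longer sets "Number of subjects"/"Number of samples" by hand. A sketch of the kind of call after which the counts end up refreshed; the paths are illustrative, and whether `add_primary_data` itself invokes this update is not shown in the diff:

```python
# After primary data is added and the dataset saved, dataset_description
# carries counts computed from the primary/ folder tree.
dataset.add_primary_data("./test_data/sample1", subject="subject-1", sample="sample-1")
dataset.save()
```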
