From 4a27f200947baf350de42a34506f9fd9cc374c2e Mon Sep 17 00:00:00 2001
From: skycoco
Date: Tue, 19 Sep 2023 15:30:08 +1200
Subject: [PATCH 1/3] update docs and create delete_thumbnail()

---
 sparc_me/core/api_tools.py | 103 +++++++++++++++++++++----------------
 sparc_me/core/dataset.py   |  89 ++++++++++++++++++++++++++------
 2 files changed, 132 insertions(+), 60 deletions(-)

diff --git a/sparc_me/core/api_tools.py b/sparc_me/core/api_tools.py
index a3abac5..4e3b2af 100644
--- a/sparc_me/core/api_tools.py
+++ b/sparc_me/core/api_tools.py
@@ -14,10 +14,13 @@ def __init__(self):
         pass
 
     def get_dataset_versions_pensieve(self, datasetId):
-        '''
-        get one dataset all versions
+        """
+        Get all versions of a single dataset.
+
+        :param datasetId: the dataset id from SPARC
+        :type datasetId: str | int
         :return: versions
-        '''
+        """
 
         if not isinstance(datasetId, str):
             datasetId = str(datasetId)
@@ -33,15 +36,16 @@ def get_dataset_versions_pensieve(self, datasetId):
         return versions
 
    def get_all_datasets_all_versions(self):
-        '''
-        Get all datasets with all versions
-        It may cost a few minutes to get the whole data,
-        Because some dataset have a lot of versions, e.g, 20,
-        And every time when the version number getter than 1,
-        it will request server for getting new data, so it waste a lot of time.
+        """
+
+        Get all datasets with all of their versions.
+        Fetching everything may take a few minutes, because some
+        datasets have many versions (e.g. 20), and every version
+        beyond the first requires an additional request to the
+        server, which is what makes this slow.
 
         :return: datasets
-        '''
+        """
         datasets = []
 
         latest_datasets = self.get_all_datasets_latest_version_pensieve()
@@ -56,10 +60,11 @@ def get_all_datasets_all_versions(self):
         return datasets
 
     def get_all_datasets_latest_version_pensieve(self):
-        '''
-        Get all datasets with latest version
+        """
+        Get all datasets at their latest version.
+
         :return: datasets | []
-        '''
+        """
 
         url = "https://api.pennsieve.io/discover/datasets?limit=2147483647&offset=0&orderBy=relevance&orderDirection=desc"
 
@@ -77,10 +82,13 @@ def get_all_datasets_latest_version_pensieve(self):
         return []
 
     def get_dataset_latest_version_pensieve(self, datasetId):
-        '''
-        :parameter: datasetId : String
-        :return:
-        '''
+        """
+        Get the latest version of a dataset.
+        :param datasetId: the dataset id from SPARC
+        :type datasetId: str | int
+        :return: the dataset's latest version, in json format
+        """
+
        if isinstance(datasetId, int):
            datasetId = str(datasetId)
        elif isinstance(datasetId, str):
@@ -97,10 +105,15 @@ def get_dataset_latest_version_pensieve(self, datasetId):
        return json.loads(response.text)
 
     def get_metadata_pensieve(self, datasetId, versionId):
-        '''
-        Get a metadata from the specific version
-        :return: metadata json format
-        '''
+        """
+        Get a dataset's metadata for a specific version.
+
+        :param datasetId: the dataset id from SPARC
+        :type datasetId: str | int
+        :param versionId: the version id of the dataset
+        :type versionId: str | int
+        :return: metadata in json format
+        """
 
         if not isinstance(datasetId, str):
             datasetId = str(datasetId)
@@ -129,10 +142,14 @@ def get_metadata_pensieve(self, datasetId, versionId):
             versionId = ""
         return versionId
 
-    def download_file(self, datasetId, filepath):
-        '''
-        Download bytes files from Pennsieve
-        '''
+    def _download_file(self, datasetId, filepath):
+        """
+        Download a file from Pennsieve as bytes.
+
+        :param datasetId: the dataset id from SPARC
+        :param filepath: path of the file within the dataset
+        :return: the HTTP response, or its failure reason
+        """
         versionId = self.get_dataset_latest_version_number(datasetId)
 
         url = "https://api.pennsieve.io/zipit/discover"
@@ -150,13 +167,14 @@ def download_file(self, datasetId, filepath):
         return response.reason
 
     def get_xlsx_csv_file_pennsieve(self, datasetId, filepath, savepath):
-        '''
-        store excel file locally
-        :param datasetId:
-        :param filepath:
-        :param savepath:
-        :return:
-        '''
+        """
+
+        Store an xlsx/csv file locally.
+        :param datasetId: dataset id from SPARC
+        :param filepath: path of the file within the dataset
+        :param savepath: directory in which to save the file
+        """
+
         pathList = filepath.split('.')
         extension = pathList[1]
         fileStrList = filepath.split('/')
@@ -169,8 +187,8 @@ def get_xlsx_csv_file_pennsieve(self, datasetId, filepath, savepath):
 
         save_dir = Path(savepath)
         if not save_dir.is_dir():
             save_dir.mkdir(parents=True, exist_ok=False)
-            response = self.download_file(datasetId, filepath)
+            response = self._download_file(datasetId, filepath)
 
         if extension == "xlsx":
             with io.BytesIO(response.content) as fh:
@@ -186,7 +204,7 @@ def get_xlsx_csv_file_pennsieve(self, datasetId, filepath, savepath):
                 df.to_csv(savepath + filename, sep=',', header=False, index=False)
 
     def get_UBERONs_From_Dataset(self, datasetId, filepath):
-        response = self.download_file(datasetId, filepath)
+        response = self._download_file(datasetId, filepath)
         with io.BytesIO(response.content) as fh:
             df = pd.read_csv(fh)
             df = df.dropna(axis=0, how='any')
@@ -196,7 +214,7 @@ def get_UBERONs_From_Dataset(self, datasetId, filepath):
     TODO: download whole dataset
     '''
 
-    def mkdir(self, paths):
+    def _mkdir(self, paths):
         for path in paths:
             savepath = "dataset/"
             fileStrList = path.split('/')
@@ -223,7 +241,7 @@ def get_all_files_path(self, dataset_id, version_id):
                 paths.append(files[idx]["path"])
         return paths
 
-    def craw(self, datasetId, versionId, url_queue: queue.Queue, html_queue: queue.Queue):
+    def _craw(self, datasetId, versionId, url_queue: queue.Queue, html_queue: queue.Queue):
         '''
         Download bytes files from Pennsieve
         '''
@@ -254,8 +272,7 @@ def craw(self, datasetId, versionId, url_queue: queue.Queue, html_queue: queue.Q
             except Exception as e:
The error is {e}") - - def parse(self, html_queue: queue.Queue): + def _parse(self, html_queue: queue.Queue): while True: res = html_queue.get() if res is None: @@ -282,7 +299,7 @@ def download_dataset(self, dataset_id, version_id=None): print("Invalid version id, Now will download the first version of the dataset for you!") paths = self.get_all_files_path(dataset_id, version_id) - self.mkdir(paths) + self._mkdir(paths) url_queue = queue.Queue() html_queue = queue.Queue() threads = [] @@ -295,11 +312,11 @@ def download_dataset(self, dataset_id, version_id=None): url_queue.put(None) for idx in range(3): - t1 = threading.Thread(target=self.craw, args=(dataset_id, version_id, url_queue, html_queue)) + t1 = threading.Thread(target=self._craw, args=(dataset_id, version_id, url_queue, html_queue)) threads.append(t1) t1.start() for idx in range(2): - t2 = threading.Thread(target=self.parse, args=(html_queue,)) + t2 = threading.Thread(target=self._parse, args=(html_queue,)) t2.start() for t in threads: @@ -318,7 +335,7 @@ def get_dataset_protocolsio_link(self, datasetId): def get_protocolsio_text(self, datasetId, dir): save_dir = Path(dir) if not save_dir.is_dir(): - save_dir.mkdir(parents=True, exist_ok=False) + save_dir._mkdir(parents=True, exist_ok=False) protocol_url = self.get_dataset_protocolsio_link(datasetId) if protocol_url: diff --git a/sparc_me/core/dataset.py b/sparc_me/core/dataset.py index c6eb75f..c4c7303 100644 --- a/sparc_me/core/dataset.py +++ b/sparc_me/core/dataset.py @@ -82,7 +82,7 @@ def _get_template_dir(self, version): return template_dir - def set_template_version(self, version): + def _set_template_version(self, version): """ Choose a template version @@ -146,6 +146,11 @@ def _load(self, dir_path): return dataset def create_empty_dataset(self, version='2.0.0'): + """ + Create an empty dataset from template via dataset version + :param version: the dataset version + :type version: '2.0.0' | '1.2.3' + """ self.load_from_template(version=version) def load_from_template(self, version): @@ -157,7 +162,7 @@ def load_from_template(self, version): :return: loaded dataset :rtype: dict """ - self.set_version(version) + self._set_version(version) # self._dataset_path = self._get_template_dir(self._version) template_dataset_path = self._get_template_dir(self._version) self._dataset = self._load(str(template_dataset_path)) @@ -179,7 +184,7 @@ def _convert_version_format(self, version): return version - def set_version(self, version): + def _set_version(self, version): """ Set dataset version version @@ -191,7 +196,7 @@ def set_version(self, version): self._version = version self._set_version_specific_variables(version) - def load_template(self, version): + def _load_template(self, version): """ Load template @@ -202,15 +207,16 @@ def load_template(self, version): """ version = self._convert_version_format(version) - self.set_template_version(version) + self._set_template_version(version) self._template_dir = self._get_template_dir(self._template_version) self._template = self._load(str(self._template_dir)) return self._template - def save_template(self, save_dir, version=None): + def _save_template(self, save_dir, version=None): """ Save the template directory locally + TODO: will delete later :param save_dir: path to the output folder :type save_dir: string @@ -241,7 +247,7 @@ def load_dataset(self, dataset_path=None, from_template=False, version=None): :rtype: dict """ if version: - self.set_version(version) + self._set_version(version) if not self._dataset_path: self._dataset_path 
            self._dataset_path = Path(dataset_path)
@@ -281,7 +287,7 @@ def save(self, save_dir="", remove_empty=False, keep_style=False):
                 data = self._filter(data, filename)
 
                 if isinstance(data, pd.DataFrame):
-                    self.set_version(self._version)
+                    self._set_version(self._version)
                     template_dir = self._get_template_dir(self._version)
 
                     if keep_style:
@@ -363,7 +369,7 @@ def list_metadata_files(self, version, print_list=True):
         """
         metadata_files = list()
 
-        self.load_template(version=version)
+        self._load_template(version=version)
 
         for key, value in self._template.items():
             if isinstance(value, dict):
@@ -421,7 +427,7 @@ def list_elements(self, metadata_file, axis=0, version=None):
             return fields
 
         if not self._template:
-            self.load_template(version=None)
+            self._load_template(version=None)
 
         data = self._template.get(metadata_file)
         metadata = data.get("metadata")
@@ -452,7 +458,15 @@ def _generate_metadata(self):
 
     def get_metadata(self, metadata_file):
         """
-        :param metadata_file: one of string of [code_description, code_parameters, dataset_description,manifest,performances,resources,samples,subjects,submission]
+        Get a Metadata object based on the metadata file name.
+        The returned object can then be used to edit that file's values.
+
+        :param metadata_file: one of [code_description,
+                              code_parameters,
+                              dataset_description,
+                              manifest, performances,
+                              resources, samples,
+                              subjects, submission]
         :type metadata_file: string
         :return: give a metadata editor for a specific metadata
         """
@@ -463,9 +477,10 @@ def get_metadata(self, metadata_file):
         metadata_file = validate_metadata_file(metadata_file, self._version)
         return self._metadata[metadata_file]
 
-    def set_field(self, metadata_file, row_index, header, value):
+    def _set_field(self, metadata_file, row_index, header, value):
         """
         Set single field by row idx/name and column name (the header)
+        TODO: will delete later
 
         :param metadata_file: metadata metadata_file
         :type metadata_file: string
@@ -500,7 +515,7 @@ def _set_field(self, metadata_file, row_index, header, value):
 
         return self._dataset
 
-    def set_field_using_row_name(self, metadata_file, row_name, header, value):
+    def _set_field_using_row_name(self, metadata_file, row_name, header, value):
         """
         Set single cell. The row is identified by the given unique name and column is identified
         by the header.
@@ -538,9 +553,9 @@ def _set_field_using_row_name(self, metadata_file, row_name, header, value):
             raise ValueError(msg)
         else:
             excel_row_index = matching_indices[0] + 2
-            return self.set_field(metadata_file=metadata_file, row_index=excel_row_index, header=header, value=value)
+            return self._set_field(metadata_file=metadata_file, row_index=excel_row_index, header=header, value=value)
 
-    def append(self, metadata_file, row, check_exist=False, unique_column=None):
+    def _append(self, metadata_file, row, check_exist=False, unique_column=None):
         """
         Append a row to a metadata file
 
@@ -629,8 +644,9 @@ def update_by_json(self, metadata_file, json_file):
 
         return metadata
 
-    def generate_file_from_template(self, save_path, metadata_file, data=pd.DataFrame(), keep_style=False):
+    def _generate_file_from_template(self, save_path, metadata_file, data=pd.DataFrame(), keep_style=False):
         """Generate file from a template and populate with data if givn
+        TODO: will delete later
 
         :param save_path: destination to save the generated file
         :type save_path: string
@@ -653,6 +669,17 @@
 
     def add_subjects(self, subjects):
+        """
+        Add a list of Subjects to the dataset.
+        This function adds the subjects and their samples to the metadata,
+        and moves the sample files from their source paths into the
+        dataset's primary subject/sample folders.
+        It automatically updates the manifest and dataset_description metadata files.
+
+        :param subjects: list of Subject objects
+        :type subjects: list
+        """
+
         self.save()
         if not isinstance(subjects, list):
             msg = "Please provide a list of subjects"
@@ -666,6 +693,7 @@ def add_subjects(self, subjects):
     def get_subject(self, subject_sds_id) -> Subject:
         """
         Get a subject by subject sds id
+
         :param subject_sds_id: subject sds id
         :type subject_sds_id: str
         :return: Subject
@@ -712,7 +740,14 @@ def add_derivative_data(self, source_path, subject, sample, copy=True, overwrite
         self._add_sample_data(source_path, self._dataset_path, subject, sample, data_type="derivative", copy=copy,
                               overwrite=overwrite)
 
-    def add_element(self, metadata_file, element):
+    def _add_element(self, metadata_file, element):
+        """
+        TODO: this method may need to be deleted later
+
+        :param metadata_file: the metadata file name
+        :param element: the element to add
+        :return:
+        """
         metadata = self._dataset.get(metadata_file).get("metadata")
         if metadata_file in self._column_based:
             row_pd = pd.DataFrame([{"Metadata element": element}])
@@ -743,6 +778,7 @@ def add_thumbnail(self, source_path, copy=True, overwrite=True):
         self._modify_manifest(fname=filename, manifest_folder_path=str(self._dataset_path),
                               destination_path=str(destination_path.parent), description=description)
 
+
     def _add_sample_data(self, source_path, dataset_path, subject, sample, data_type="primary", copy=True,
                          overwrite=True):
         """Copy or move data from source folder to destination folder
@@ -963,7 +999,26 @@ def delete_sample(self, destination_path, data_type="primary"):
                 samples_metadata.remove_row(sam_folder.name)
                 samples_metadata.save()
 
+    def delete_thumbnail(self, destination_path):
+        """
+        Delete a thumbnail from the dataset.
+        The manifest metadata is updated automatically.
+
+        :param destination_path: the thumbnail path in the dataset that you want to delete
+        :type destination_path: str
+        """
+        self.delete_data(destination_path)
+
     def delete_data(self, destination_path):
+        """
+        Delete a file, given its path within the dataset.
+        It will automatically update the manifest metadata.
+        TODO: need to connect delete sample and subject, and update subject and sample metadata
+
+        :param destination_path: the file path that you want to delete
+        :type destination_path: str
+        :return:
+        """
         if not Path(destination_path).exists():
             msg = f"The file {str(destination_path)} is not existing"
             raise FileNotFoundError(msg)

From dce386c7aa2b4f2d530654a8509e62cbe6490853 Mon Sep 17 00:00:00 2001
From: skycoco
Date: Tue, 19 Sep 2023 16:38:30 +1200
Subject: [PATCH 2/3] fixed all issues in #106

---
 examples/example_for_create_dataset.py | 60 ++++++++++++++++++++------
 setup.py                               |  2 +-
 sparc_me/core/metadata.py              | 60 ++++++++++++++++++++------
 sparc_me/core/schema.py                | 21 +++++++--
 4 files changed, 113 insertions(+), 30 deletions(-)

diff --git a/examples/example_for_create_dataset.py b/examples/example_for_create_dataset.py
index bfb2748..123560b 100644
--- a/examples/example_for_create_dataset.py
+++ b/examples/example_for_create_dataset.py
@@ -103,8 +103,10 @@ def add_values_dataset_description(dataset_description):
     # code_parameters = dataset.get_metadata(metadata_file="code_parameters")
     # code_description = dataset.get_metadata(metadata_file="code_description")
 
-    des_schema = schema.get_schema("dataset_description")
-    des_schema.get('subtitle')
+    print("******************************************")
+    des_schema = schema.get_schema("dataset_description", name_only=False)
+    print(des_schema)
+
     # NOTE: Step3.1(optional), remove entire values in dataset_description
     dataset_description.clear_values()
 
@@ -191,19 +193,49 @@ def add_values_dataset_description(dataset_description):
     # add_values_for_subject_metadata(subject_metadata)
 
     # New function for add subjects and samples
+    # subjects = []
+    # for subject_user_id in [1, 2]:
+    #     samples = []
+    #     for sample_user_id in [1, 2]:
+    #         sample = sm.Sample()
+    #         sample.add_path(
+    #             "./test_data/bids_data/sub-0{0}/sequence{1}/".format(
+    #                 subject_user_id, sample_user_id))
+    #         samples.append(sample)
+    #
+    #     subject = sm.Subject()
+    #     subject.add_samples(samples)
+    #     subjects.append(subject)
+
     subjects = []
-    for subject_user_id in [1, 2]:
-        samples = []
-        for sample_user_id in [1, 2]:
-            sample = sm.Sample()
-            sample.add_path(
-                "./test_data/bids_data/sub-0{0}/sequence{1}/".format(
-                    subject_user_id, sample_user_id))
-            samples.append(sample)
-
-        subject = sm.Subject()
-        subject.add_samples(samples)
-        subjects.append(subject)
+    samples = []
+
+    sample1 = sm.Sample()
+    sample1.add_path("./test_data/bids_data/sub-01/sequence1/")
+    sample1.add_path("./test_data/sample2/raw/dummy_sam2.txt")
+    samples.append(sample1)
+
+    sample2 = sm.Sample()
+    sample2.add_path("./test_data/bids_data/sub-01/sequence2/")
+    samples.append(sample2)
+
+    subject1 = sm.Subject()
+    subject1.add_samples(samples)
+    subjects.append(subject1)
+
+    samples = []
+
+    sample1 = sm.Sample()
+    sample1.add_path("./test_data/bids_data/sub-02/sequence1/")
+    samples.append(sample1)
+
+    sample2 = sm.Sample()
+    sample2.add_path("./test_data/bids_data/sub-02/sequence2/")
+    samples.append(sample2)
+
+    subject2 = sm.Subject()
+    subject2.add_samples(samples)
+    subjects.append(subject2)
 
     dataset.add_subjects(subjects)
 
diff --git a/setup.py b/setup.py
index 3605e38..dbd971e 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 
 setup(
     name="sparc_me",
-    version="2.2.3",
+    version="2.2.8",
     description='A python tool to explore, enhance, and expand SPARC datasets and their descriptions in accordance with FAIR principles.',
     author="Thiranja Prasad Babarenda Gamage, Chinchien Lin, Savindi Wijenayaka, Michael Hoffman, Linkun Gao, Haribalan Kumar",
     email="psam012@aucklanduni.ac.nz, clin864@aucklanduni.ac.nz",
diff --git a/sparc_me/core/metadata.py b/sparc_me/core/metadata.py
index e150eb2..9e9acd0 100644
--- a/sparc_me/core/metadata.py
+++ b/sparc_me/core/metadata.py
@@ -4,6 +4,7 @@
 import shutil
 from sparc_me.core.utils import find_col_element
 from datetime import datetime, timezone
+from typing import List
 
 
 class Metadata:
@@ -406,11 +407,12 @@ class Sample:
     _metadata: Metadata = None
     _manifest_metadata: Metadata = None
 
+
     def __init__(self):
         self.sample_id = ""
         self.subject_id = ""
         self.sample_dir = Path()
-        self.source_sam_dir = Path()
+        self.source_sample_paths: List[Path] = []
         self.index = -1
 
     def set_subject_id(self, sub_id):
@@ -469,16 +471,42 @@ def add_path(self, source_path):
         Add sample source path to sample object
 
         :param source_path: sample folder source path
-        :type source_path: str
 
        """
-        self.source_sam_dir = Path(source_path)
+        :type source_path: str | list
+
+        """
+        if isinstance(source_path, list):
+            for file_path in source_path:
+                self.source_sample_paths.append(Path(file_path))
+        else:
+            self.source_sample_paths.append(Path(source_path))
+
+
+    def set_path(self, source_path):
+        """
+        Set the sample source path(s) on the sample object,
+        overriding any previously added paths.
+
+        :param source_path: sample folder source path
+        :type source_path: str | list
+
+        """
+        if isinstance(source_path, list):
+            self.source_sample_paths = []
+            for file_path in source_path:
+                self.source_sample_paths.append(Path(file_path))
+        else:
+            self.source_sample_paths = [Path(source_path)]
+
 
     def set_values(self, metadata={}):
         """
         :param metadata: key : value dict (element:value)
         :type metadata: dict
         """
+        if not isinstance(metadata, dict):
+            msg = f"Expected a dict of element:value pairs, but got {type(metadata)}"
+            raise TypeError(msg)
+
         for element, value in metadata.items():
             if element == 'sample id' or element == 'subject id':
                 continue
@@ -511,15 +539,19 @@ def move(self):
         if not self.sample_dir.exists():
             self.sample_dir.mkdir(parents=True, exist_ok=True)
 
-        source_sample_files = self.source_sam_dir.rglob("*")
-        for file in source_sample_files:
-            if file.is_file():
-                relative_path = file.relative_to(self.source_sam_dir)
-                target_file = self.sample_dir / relative_path
-                target_file.parent.mkdir(parents=True, exist_ok=True)
-                shutil.copy(str(file), str(target_file))
-                self._update_manifest(sample_path=str(target_file))
-
+        for source_sam in self.source_sample_paths:
+            if source_sam.is_dir():
+                source_sample_files = source_sam.rglob("*")
+                for file in source_sample_files:
+                    if file.is_file():
+                        relative_path = file.relative_to(source_sam)
+                        target_file = self.sample_dir / relative_path
+                        target_file.parent.mkdir(parents=True, exist_ok=True)
+                        shutil.copy(str(file), str(target_file))
+                        self._update_manifest(sample_path=str(target_file))
+            elif source_sam.is_file():
+                shutil.copy(str(source_sam), str(self.sample_dir))
+                self._update_manifest(sample_path=str(self.sample_dir / source_sam.name))
     def _update_manifest(self, sample_path):
         """
         Update manifest metadata, after remove samples
@@ -661,6 +693,10 @@ def set_values(self, metadata={}):
         :param metadata: key : value dict (element:value)
         :type metadata: dict
         """
+        if not isinstance(metadata, dict):
+            msg = f"Expected a dict of element:value pairs, but got {type(metadata)}"
+            raise TypeError(msg)
+
         for element, value in metadata.items():
             if element == 'subject id':
                 continue
diff --git a/sparc_me/core/schema.py b/sparc_me/core/schema.py
index ff7b9e9..bad187a 100644
--- a/sparc_me/core/schema.py
+++ b/sparc_me/core/schema.py
@@ -52,7 +52,6 @@ def validate_dataset(self, dataset):
                 data = schema.load_data(metadata.metadata_file_path)
                 self.validate(data, metadata_file=metadata_file, version=metadata.version)
 
-
     def validate(self, data, metadata_file, version):
         """
         Validate data instance
@@ -149,9 +148,14 @@ def get_default_schema(version, metadata_file):
 
         return schema
 
-    def get_schema(self, metadata_file, version="2.0.0", print_schema=True):
+    def get_schema(self, metadata_file, version="2.0.0", print_schema=True, required_only=True, name_only=True):
         """
         get a schema via metadata_file/metadate file name
+
         :param metadata_file: the metadata file name
         :type metadata_file: str
         :param version: "2.0.0"|"1.2.3"
+        :param required_only: if True, only report the schema's required elements
+        :type required_only: bool
+        :param name_only: if True, print just the names of the required elements
+        :type name_only: bool
@@ -171,7 +171,22 @@ def get_schema(self, metadata_file, version="2.0.0", print_schema=True):
         with open(schema_path, 'r') as file:
             schema_json: Dict = json.load(file)
             if print_schema:
-                print(json.dumps(schema_json.get('properties'), indent=4))
+                if required_only:
+                    if name_only:
+                        print(f"The required elements for {metadata_file}:")
+                        print(json.dumps(schema_json.get('required'), indent=4))
+                    else:
+                        required_items = []
+                        for key, value in schema_json.get('properties').items():
+                            if "required" in value and value["required"] == "Y":
+                                required_items.append({key: value})
+
+                        print(f"The required elements for {metadata_file}:")
+                        print(json.dumps(required_items, indent=4))
+                        return required_items
+                else:
+                    print(json.dumps(schema_json.get('properties'), indent=4))
+
         return CaseInsensitiveDict(schema_json.get('properties'))
 
     def set_schema(self, schema):

From 74808249fb811df3410801ccfa06260413ac051f Mon Sep 17 00:00:00 2001
From: skycoco
Date: Tue, 19 Sep 2023 16:43:11 +1200
Subject: [PATCH 3/3] rename delete thumbnail to remove thumbnail

---
 sparc_me/core/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sparc_me/core/dataset.py b/sparc_me/core/dataset.py
index c4c7303..6c31726 100644
--- a/sparc_me/core/dataset.py
+++ b/sparc_me/core/dataset.py
@@ -999,7 +999,7 @@ def delete_sample(self, destination_path, data_type="primary"):
             samples_metadata.remove_row(sam_folder.name)
             samples_metadata.save()
 
-    def delete_thumbnail(self, destination_path):
+    def remove_thumbnail(self, destination_path):
         """
         Delete a thumbnail from the dataset.
         The manifest metadata is updated automatically.
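
---

For anyone trying the series end to end, here is a minimal usage sketch of the
API after all three patches. It is only a sketch: it assumes sparc_me exports
Dataset and Schema at the package level (only Subject and Sample are visible in
the example diff), and every path below is a placeholder.

    import sparc_me as sm

    # Build an empty dataset from the 2.0.0 template
    # (create_empty_dataset() wraps load_from_template()).
    dataset = sm.Dataset()  # assumed export
    dataset.create_empty_dataset(version="2.0.0")

    # Sample.add_path() now accepts a single path or a list of paths;
    # set_path() overrides anything added before.
    sample = sm.Sample()
    sample.add_path(["./test_data/bids_data/sub-01/sequence1/",
                     "./test_data/sample2/raw/dummy_sam2.txt"])

    subject = sm.Subject()
    subject.add_samples([sample])

    # Copies the sample files into the primary subject/sample folders and
    # updates the manifest and dataset_description metadata files.
    dataset.add_subjects([subject])

    # get_schema() now defaults to required_only=True, name_only=True,
    # which prints just the names of the required schema elements.
    schema = sm.Schema()  # assumed export
    schema.get_schema("dataset_description")

    # After PATCH 3/3 the thumbnail helper is remove_thumbnail(); the
    # argument is the thumbnail's path inside the dataset (placeholder here).
    dataset.remove_thumbnail("./dataset/docs/thumbnail_0.jpg")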