Solve the add_path issue and get schema #108

Merged · 6 commits · Sep 19, 2023
60 changes: 46 additions & 14 deletions examples/example_for_create_dataset.py
@@ -103,8 +103,10 @@ def add_values_dataset_description(dataset_description):
# code_parameters = dataset.get_metadata(metadata_file="code_parameters")
# code_description = dataset.get_metadata(metadata_file="code_description")

des_schema = schema.get_schema("dataset_description")
des_schema.get('subtitle')
print("******************************************")
des_schema = schema.get_schema("dataset_description", name_only=False)
print(des_schema)


# NOTE: Step3.1(optional), remove entire values in dataset_description
dataset_description.clear_values()
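Reviewer note: a minimal sketch of the two call styles for the new name_only flag, assuming a Schema class exposed at the package root (the import path and the default value of name_only are assumptions, not confirmed by this diff):

from sparc_me import Schema  # assumed import path

schema = Schema()

# Assumed default: only the field names of the schema are returned.
field_names = schema.get_schema("dataset_description")

# name_only=False (exercised in this PR): the full schema is returned,
# so individual entries such as "subtitle" can be inspected.
full_schema = schema.get_schema("dataset_description", name_only=False)
print(full_schema.get("subtitle"))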
@@ -191,19 +193,49 @@ def add_values_dataset_description(dataset_description):
# add_values_for_subject_metadata(subject_metadata)

# New function for adding subjects and samples
# subjects = []
# for subject_user_id in [1, 2]:
# samples = []
# for sample_user_id in [1, 2]:
# sample = sm.Sample()
# sample.add_path(
# "./test_data/bids_data/sub-0{0}/sequence{1}/".format(
# subject_user_id, sample_user_id))
# samples.append(sample)
#
# subject = sm.Subject()
# subject.add_samples(samples)
# subjects.append(subject)

subjects = []
for subject_user_id in [1, 2]:
samples = []
for sample_user_id in [1, 2]:
sample = sm.Sample()
sample.add_path(
"./test_data/bids_data/sub-0{0}/sequence{1}/".format(
subject_user_id, sample_user_id))
samples.append(sample)

subject = sm.Subject()
subject.add_samples(samples)
subjects.append(subject)
samples = []

sample1 = sm.Sample()
sample1.add_path("./test_data/bids_data/sub-01/sequence1/")
sample1.add_path("./test_data/sample2/raw/dummy_sam2.txt")
samples.append(sample1)

sample2 = sm.Sample()
sample2.add_path("./test_data/bids_data/sub-01/sequence2/")
samples.append(sample2)

subject1 = sm.Subject()
subject1.add_samples(samples)
subjects.append(subject1)

samples = []

sample1 = sm.Sample()
sample1.add_path("./test_data/bids_data/sub-02/sequence1/")
samples.append(sample1)

sample2 = sm.Sample()
sample2.add_path("./test_data/bids_data/sub-02/sequence2/")
samples.append(sample2)

subject2 = sm.Subject()
subject2.add_samples(samples)
subjects.append(subject2)

dataset.add_subjects(subjects)
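Reviewer note: the explicit version above repeats one pattern per subject; a small helper (hypothetical, not part of this PR) would keep such examples short while still exercising add_path with both directories and single files. Unlike sample1 above, which attaches two paths to one sample, this sketch gives each path its own sample:

import sparc_me as sm  # assumed alias, matching the example above

def make_subject(paths):
    # Hypothetical helper: one sample per entry; an entry may be a
    # directory (e.g. a BIDS sequence folder) or a single file.
    samples = []
    for path in paths:
        sample = sm.Sample()
        sample.add_path(path)
        samples.append(sample)
    subject = sm.Subject()
    subject.add_samples(samples)
    return subject

subjects = [
    make_subject(["./test_data/bids_data/sub-01/sequence1/",
                  "./test_data/bids_data/sub-01/sequence2/"]),
    make_subject(["./test_data/bids_data/sub-02/sequence1/",
                  "./test_data/bids_data/sub-02/sequence2/"]),
]
dataset.add_subjects(subjects)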

2 changes: 1 addition & 1 deletion setup.py
@@ -4,7 +4,7 @@

setup(
name="sparc_me",
version="2.2.3",
version="2.2.8",
description='A python tool to explore, enhance, and expand SPARC datasets and their descriptions in accordance with FAIR principles.',
author="Thiranja Prasad Babarenda Gamage, Chinchien Lin, Savindi Wijenayaka, Michael Hoffman, Linkun Gao, Haribalan Kumar",
email="[email protected], [email protected]",
103 changes: 60 additions & 43 deletions sparc_me/core/api_tools.py
@@ -14,10 +14,13 @@ def __init__(self):
pass

def get_dataset_versions_pensieve(self, datasetId):
'''
get one dataset all versions
"""
Get all versions of one dataset

:param datasetId: the dataset id from SPARC
:type datasetId: str|int
:return: versions
'''
"""

if not isinstance(datasetId, str):
datasetId = str(datasetId)
@@ -33,15 +36,16 @@ def get_dataset_versions_pensieve(self, datasetId):
return versions
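Reviewer note: a minimal usage sketch for this endpoint, assuming the class in api_tools.py is exported as Dataset_Api (the import path and the "version" key are assumptions):

from sparc_me import Dataset_Api  # assumed export

api = Dataset_Api()
# The id may be given as str or int; the method normalises it to str.
versions = api.get_dataset_versions_pensieve(156)  # 156 is an illustrative id
for v in versions:
    print(v.get("version"))  # one JSON record per published version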

def get_all_datasets_all_versions(self):
'''
Get all datasets with all versions
It may cost a few minutes to get the whole data,
Because some dataset have a lot of versions, e.g, 20,
And every time when the version number getter than 1,
it will request server for getting new data, so it waste a lot of time.
"""

Get all datasets with all versions.
Fetching everything may take a few minutes: some datasets have many
versions (e.g. 20), and every version beyond the first triggers a
separate request to the server, which dominates the run time.

:return: datasets
'''
"""
datasets = []

latest_datasets = self.get_all_datasets_latest_version_pensieve()
@@ -56,10 +60,11 @@ def get_all_datasets_all_versions(self):
return datasets

def get_all_datasets_latest_version_pensieve(self):
'''
Get all datasets with latest version
"""
Get the latest version of every dataset

:return: datasets | []
'''
"""

url = "https://api.pennsieve.io/discover/datasets?limit=2147483647&offset=0&orderBy=relevance&orderDirection=desc"

@@ -77,10 +82,13 @@ def get_all_datasets_latest_version_pensieve(self):
return []
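Reviewer note: listing every dataset at its latest version, reusing the api object from the sketch above:

datasets = api.get_all_datasets_latest_version_pensieve()
print(len(datasets))  # number of published datasets, or 0 if the request failed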

def get_dataset_latest_version_pensieve(self, datasetId):
'''
:parameter: datasetId : String
:return:
'''
"""

:param datasetId: the dataset id from SPARC
:type datasetId: str|int
:return: the dataset's latest version record, in JSON format
"""

if isinstance(datasetId, int):
datasetId = str(datasetId)
elif isinstance(datasetId, str):
@@ -97,10 +105,15 @@ def get_dataset_latest_version_pensieve(self, datasetId):
return json.loads(response.text)

def get_metadata_pensieve(self, datasetId, versionId):
'''
Get a metadata from the specific version
:return: metadata json format
'''
"""
Get metadata for a specific dataset version

:param datasetId: the dataset id from SPARC
:type datasetId: str | int
:param versionId: the version id of the dataset
:type versionId: str | int
:return: metadata in JSON format
"""

if not isinstance(datasetId, str):
datasetId = str(datasetId)
@@ -129,10 +142,14 @@ def get_dataset_latest_version_number(self, datasetId):
versionId = ""
return versionId
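Reviewer note: the two lookups above compose naturally; a sketch, again reusing the api object from the earlier sketch (the "name" key is an assumption about the discover API's response shape):

version_id = api.get_dataset_latest_version_number(156)
metadata = api.get_metadata_pensieve(156, version_id)
print(metadata.get("name"))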

def download_file(self, datasetId, filepath):
'''
Download bytes files from Pennsieve
'''
def _download_file(self, datasetId, filepath):
"""
Download a file (as bytes) from Pennsieve

:param datasetId: the dataset id from SPARC
:param filepath: path of the file within the dataset
:return: the HTTP response containing the file bytes
"""
versionId = self.get_dataset_latest_version_number(datasetId)

url = "https://api.pennsieve.io/zipit/discover"
@@ -150,13 +167,14 @@ def download_file(self, datasetId, filepath):
return response.reason

def get_xlsx_csv_file_pennsieve(self, datasetId, filepath, savepath):
'''
store excel file locally
:param datasetId:
:param filepath:
:param savepath:
:return:
'''
"""

Store an Excel/CSV file locally
:param datasetId: dataset id from SPARC
:param filepath: path of the file within the dataset
:param savepath: local directory in which to save the file
"""

pathList = filepath.split('.')
extension = pathList[-1]  # last segment, so dots elsewhere in the path are ignored
fileStrList = filepath.split('/')
@@ -169,8 +187,8 @@ def get_xlsx_csv_file_pennsieve(self, datasetId, filepath, savepath):

save_dir = Path(savepath)
if not save_dir.is_dir():
save_dir.mkdir(parents=True, exist_ok=False)
response = self.download_file(datasetId, filepath)
save_dir.mkdir(parents=True, exist_ok=False)  # Path.mkdir; only the API helper was renamed to _mkdir
response = self._download_file(datasetId, filepath)

if extension == "xlsx":
with io.BytesIO(response.content) as fh:
@@ -186,7 +204,7 @@ def get_xlsx_csv_file_pennsieve(self, datasetId, filepath, savepath):
df.to_csv(savepath + filename, sep=',', header=False, index=False)
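Reviewer note: a usage sketch for the Excel/CSV helper, continuing with the same api object; the file path is illustrative, not taken from a real dataset:

# Saves the sheet under ./downloads/, converting xlsx content via pandas.
api.get_xlsx_csv_file_pennsieve(156, "files/docs/samples.xlsx", "./downloads/")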

def get_UBERONs_From_Dataset(self, datasetId, filepath):
response = self.download_file(datasetId, filepath)
response = self._download_file(datasetId, filepath)
with io.BytesIO(response.content) as fh:
df = pd.read_csv(fh)
df = df.dropna(axis=0, how='any')
@@ -196,7 +214,7 @@ def get_UBERONs_From_Dataset(self, datasetId, filepath):
TODO: download whole dataset
'''

def mkdir(self, paths):
def _mkdir(self, paths):
for path in paths:
savepath = "dataset/"
fileStrList = path.split('/')
@@ -223,7 +241,7 @@ def get_all_files_path(self, dataset_id, version_id):
paths.append(files[idx]["path"])
return paths

def craw(self, datasetId, versionId, url_queue: queue.Queue, html_queue: queue.Queue):
def _craw(self, datasetId, versionId, url_queue: queue.Queue, html_queue: queue.Queue):
'''
Download files (as bytes) from Pennsieve
'''
@@ -254,8 +272,7 @@ def craw(self, datasetId, versionId, url_queue: queue.Queue, html_queue: queue.Queue):
except Exception as e:
print(f"The file: {filepath} download failed! The error is {e}")


def parse(self, html_queue: queue.Queue):
def _parse(self, html_queue: queue.Queue):
while True:
res = html_queue.get()
if res is None:
@@ -282,7 +299,7 @@ def download_dataset(self, dataset_id, version_id=None):
print("Invalid version id, Now will download the first version of the dataset for you!")

paths = self.get_all_files_path(dataset_id, version_id)
self.mkdir(paths)
self._mkdir(paths)
url_queue = queue.Queue()
html_queue = queue.Queue()
threads = []
@@ -295,11 +312,11 @@ def download_dataset(self, dataset_id, version_id=None):
url_queue.put(None)

for idx in range(3):
t1 = threading.Thread(target=self.craw, args=(dataset_id, version_id, url_queue, html_queue))
t1 = threading.Thread(target=self._craw, args=(dataset_id, version_id, url_queue, html_queue))
threads.append(t1)
t1.start()
for idx in range(2):
t2 = threading.Thread(target=self.parse, args=(html_queue,))
t2 = threading.Thread(target=self._parse, args=(html_queue,))
t2.start()
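# Reviewer note: producer/consumer layout. The three _craw workers pull
# file paths from url_queue (exiting when they hit the None sentinels put
# on the queue above) and push responses onto html_queue, which the two
# _parse workers drain.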

for t in threads:
@@ -318,7 +335,7 @@ def get_dataset_protocolsio_link(self, datasetId):
def get_protocolsio_text(self, datasetId, dir):
save_dir = Path(dir)
if not save_dir.is_dir():
save_dir.mkdir(parents=True, exist_ok=False)
save_dir.mkdir(parents=True, exist_ok=False)  # Path.mkdir, as above; _mkdir is the renamed API helper

protocol_url = self.get_dataset_protocolsio_link(datasetId)
if protocol_url: