From 0dc21f7e406b27866b21239d2adad20aa8b576c0 Mon Sep 17 00:00:00 2001 From: Hugh Sorby Date: Thu, 5 Sep 2024 13:32:20 +0200 Subject: [PATCH 1/4] Ignore .DS_Store. --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index bd6ad26..7196f77 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,7 @@ dmypy.json # PyCharm .idea/ + +# macOS System files. +.DS_Store + From f0292e68430db31c430cb428d9b8e92da5ae0890 Mon Sep 17 00:00:00 2001 From: Hugh Sorby Date: Thu, 5 Sep 2024 13:33:36 +0200 Subject: [PATCH 2/4] Update requirements.txt. --- requirements.txt | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 622b50c..11851f8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,17 @@ -requests==2.27.1 algoliasearch==2.6.2 boto3==1.17.67 +botocore==1.20.112 +certifi==2022.12.7 +charset-normalizer==3.1.0 +idna==3.4 +isodate==0.6.1 +jmespath==0.10.0 mapknowledge @ https://github.com/AnatomicMaps/map-knowledge/releases/download/v0.13.1/mapknowledge-0.13.1-py3-none-any.whl +networkx==2.8.8 +pyparsing==3.0.9 +python-dateutil==2.8.2 +rdflib==6.2.0 +requests==2.28.2 +s3transfer==0.4.2 +six==1.16.0 +urllib3==1.26.14 From 0988f281c508756b2cc40f50b0f6a7834d130537 Mon Sep 17 00:00:00 2001 From: Hugh Sorby Date: Thu, 5 Sep 2024 13:34:18 +0200 Subject: [PATCH 3/4] Update isSourceOf to IsSourceOf and isDerivedFrom to IsDerivedFrom. --- README.rst | 12 ++++++------ tests/slow_tests/test_datasets_tests.py | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/README.rst b/README.rst index 46ecd1c..f2fc517 100644 --- a/README.rst +++ b/README.rst @@ -87,7 +87,7 @@ ObjectErrors ------------ The object errors list provide the details of errors found in the dataset's objects list. These errors generally indicate there are problems in the file path or annotations. -ThumbnailError: Thumbnail not found in isSourceOf +ThumbnailError: Thumbnail not found in IsSourceOf ````````````````````````````````````````````````` This error occurs when the file is one of the following types:: @@ -95,10 +95,10 @@ This error occurs when the file is one of the following types:: application/x.vnd.abi.scaffold.meta+json text/vnd.abi.plot+tab-separated-values text/vnd.abi.plot+csv -Cause of the error: None of the files in the isSourceOf field of this file entry in the manifest has the mimetype - "inode/vnd.abi.plot+thumbnail". -Action: Check the manifest and make sure thumbnail entries are correctly annotated and added to the isSourceOf field of the corresponding file. +Cause of the error: None of the files in the IsSourceOf field of this file entry in the manifest has the mimetype - "inode/vnd.abi.plot+thumbnail". +Action: Check the manifest and make sure thumbnail entries are correctly annotated and added to the IsSourceOf field of the corresponding file. -ThumbnailError: Missing isSourceOf entry +ThumbnailError: Missing IsSourceOf entry ```````````````````````````````````````` This error occurs when the file is one of the following types:: @@ -106,8 +106,8 @@ This error occurs when the file is one of the following types:: application/x.vnd.abi.scaffold.meta+json text/vnd.abi.plot+tab-separated-values text/vnd.abi.plot+csv -Cause of the error: The entry of this file in the manifest does not have any entry or the entry is absent in the isSourceOf field. -Action: Check the manifest and make sure isSourceOf contains a valid thumbnail entry. +Cause of the error: The entry of this file in the manifest does not have any entry or the entry is absent in the IsSourceOf field. +Action: Check the manifest and make sure IsSourceOf contains a valid thumbnail entry. Reason: An error occurred (404) when calling the HeadObject operation: Not Found ```````````````````````````````````````````````````````````````````````````````` diff --git a/tests/slow_tests/test_datasets_tests.py b/tests/slow_tests/test_datasets_tests.py index b183e8f..e3c14f2 100644 --- a/tests/slow_tests/test_datasets_tests.py +++ b/tests/slow_tests/test_datasets_tests.py @@ -80,6 +80,7 @@ def getDatasets(start, size): return requests.post(urljoin(scicrunch_host, '_search?preference=abiknowledgetesting'), json=scicrunch_request, params=params, headers=headers) + def map_mime_type(mime_type): if mime_type == '': return NOT_SPECIFIED @@ -128,7 +129,7 @@ def getObjectMimeType(obj): mime_type = mime_type.get('name') return mime_type -#Check if any of the item in isSourceOf is a thumbnail for the object +#Check if any of the item in IsSourceOf is a thumbnail for the object def checkForThumbnail(obj, obj_list): local_mapped_type = map_mime_type(getObjectMimeType(obj)) if local_mapped_type == THUMBNAIL_IMAGE: @@ -138,8 +139,8 @@ def checkForThumbnail(obj, obj_list): if 'dataset' in obj and 'path' in obj['dataset']: localPath = obj['dataset']['path'] #Found view file, check for thumbnail - if 'datacite' in obj and 'isSourceOf' in obj['datacite']: - isSourceOf = obj['datacite']['isSourceOf'] + if 'datacite' in obj and 'IsSourceOf' in obj['datacite']: + isSourceOf = obj['datacite']['IsSourceOf'] if 'relative' in isSourceOf and 'path' in isSourceOf['relative']: for path in isSourceOf['relative']['path']: actualPath = urllib.parse.urljoin(localPath, path) @@ -151,8 +152,8 @@ def checkForThumbnail(obj, obj_list): #Generate report for datacite in the object def getDataciteReport(obj_list, obj, mapped_mimetype, filePath): - keysToCheck = { 'isDerivedFrom': 0, 'isSourceOf': 0} - reports = {'TotalErrors':0, 'ThumbnailError': 'None', 'ItemTested':0, 'isDerivedFrom': [], 'isSourceOf': [] } + keysToCheck = { 'IsDerivedFrom': 0, 'IsSourceOf': 0} + reports = {'TotalErrors':0, 'ThumbnailError': 'None', 'ItemTested':0, 'IsDerivedFrom': [], 'IsSourceOf': [] } thumbnailFound = False if 'datacite' in obj: @@ -175,7 +176,7 @@ def getDataciteReport(obj_list, obj, mapped_mimetype, filePath): } ) reports['TotalErrors'] +=1 - elif key == 'isSourceOf': + elif key == 'IsSourceOf': #Check for thumbnail thumbnailFound = checkForThumbnail(obj_list[found], obj_list) except: @@ -189,12 +190,12 @@ def getDataciteReport(obj_list, obj, mapped_mimetype, filePath): reports['TotalErrors'] +=1 if mapped_mimetype in MIMETYPE_WITH_THUMBNAILS: - if keysToCheck['isSourceOf'] == 0: - reports['ThumbnailError'] = 'Missing isSourceOf entry' + if keysToCheck['IsSourceOf'] == 0: + reports['ThumbnailError'] = 'Missing IsSourceOf entry' reports['ThumbnailErrorDetails'] = doc_link + '#thumbnailerror-missing-issourceof-entry' reports['TotalErrors'] +=1 if thumbnailFound == False: - reports['ThumbnailError'] = 'Thumbnail not found in isSourceOf' + reports['ThumbnailError'] = 'Thumbnail not found in IsSourceOf' reports['ThumbnailErrorDetails'] = doc_link + '#thumbnailerror-thumbnail-not-found-in-issourceof' reports['TotalErrors'] +=1 From 1400928be9ca17396d5357aa8219532275961d18 Mon Sep 17 00:00:00 2001 From: Hugh Sorby Date: Thu, 5 Sep 2024 13:40:04 +0200 Subject: [PATCH 4/4] Improve compliance with PEP8. --- tests/slow_tests/test_datasets_tests.py | 96 +++++++++++++------------ 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/tests/slow_tests/test_datasets_tests.py b/tests/slow_tests/test_datasets_tests.py index cfe9533..546f7a1 100644 --- a/tests/slow_tests/test_datasets_tests.py +++ b/tests/slow_tests/test_datasets_tests.py @@ -12,7 +12,7 @@ error_report = {} doc_link = 'https://github.com/ABI-Software/scicrunch-knowledge-testing/tree/doc_v1' -#the following should either be a falsy value or a string containg dataset number +# The following should either be a falsy value or a string containing dataset number checkDatasetOnly = False s3 = boto3.client( @@ -31,9 +31,9 @@ THUMBNAIL_IMAGE = 'abi-thumbnail' NOT_SPECIFIED = 'not-specified' -MIMETYPE_WITH_THUMBNAILS = [ PLOT_FILE, SCAFFOLD_FILE, SCAFFOLD_VIEW_FILE] +MIMETYPE_WITH_THUMBNAILS = [PLOT_FILE, SCAFFOLD_FILE, SCAFFOLD_VIEW_FILE] -TEST_MIME_TYPES = { +TEST_MIME_TYPES = { 'application/x.vnd.abi.context-information+json': CONTEXT_FILE, 'application/x.vnd.abi.scaffold.meta+json': SCAFFOLD_FILE, 'application/x.vnd.abi.scaffold.view+json': SCAFFOLD_VIEW_FILE, @@ -50,7 +50,6 @@ def getDatasets(start, size): - headers = {'accept': 'application/json'} params = {'api_key': Config.SCICRUNCH_API_KEY} @@ -59,7 +58,7 @@ def getDatasets(start, size): scicrunch_request = { "from": start, "size": size, - #For checking specific dataset + # For checking specific dataset "_source": [ "item.curie", @@ -80,7 +79,7 @@ def getDatasets(start, size): "pennsieve.identifier.aggregate": { "query": checkDatasetOnly } - } + } } scicrunch_request["query"] = query @@ -105,7 +104,8 @@ def map_mime_type(mime_type): return NOT_SPECIFIED -#Get file header response from s3 bucket + +# Get file header response from s3 bucket def getFileResponse(localPath, path, mime_type, bucket): try: head_response = s3.head_object( @@ -115,7 +115,7 @@ def getFileResponse(localPath, path, mime_type, bucket): ) if head_response and 'ResponseMetadata' in head_response \ - and 200 == head_response['ResponseMetadata']['HTTPStatusCode']: + and 200 == head_response['ResponseMetadata']['HTTPStatusCode']: pass else: return { @@ -133,23 +133,25 @@ def getFileResponse(localPath, path, mime_type, bucket): } return None -#Get the mimetype + +# Get the mimetype def getObjectMimeType(obj): mime_type = obj.get('additional_mimetype', NOT_SPECIFIED) if mime_type != NOT_SPECIFIED: mime_type = mime_type.get('name') - return mime_type + return mime_type + -#Check if any of the item in IsSourceOf is a thumbnail for the object +# Check if any of the item in IsSourceOf is a thumbnail for the object def checkForThumbnail(obj, obj_list): local_mapped_type = map_mime_type(getObjectMimeType(obj)) if local_mapped_type == THUMBNAIL_IMAGE: - #Thumbnail found + # Thumbnail found return True elif local_mapped_type == SCAFFOLD_VIEW_FILE: if 'dataset' in obj and 'path' in obj['dataset']: localPath = obj['dataset']['path'] - #Found view file, check for thumbnail + # Found view file, check for thumbnail if 'datacite' in obj and 'IsSourceOf' in obj['datacite']: isSourceOf = obj['datacite']['IsSourceOf'] if 'relative' in isSourceOf and 'path' in isSourceOf['relative']: @@ -158,13 +160,14 @@ def checkForThumbnail(obj, obj_list): found = next((i for i, item in enumerate(obj_list) if item['dataset']['path'] == actualPath), None) if found and map_mime_type(getObjectMimeType(obj_list[found])): return True - + return False -#Generate report for datacite in the object + +# Generate report for datacite in the object def getDataciteReport(obj_list, obj, mapped_mimetype, filePath): - keysToCheck = { 'IsDerivedFrom': 0, 'IsSourceOf': 0} - reports = {'TotalErrors':0, 'ThumbnailError': 'None', 'ItemTested':0, 'IsDerivedFrom': [], 'IsSourceOf': [] } + keysToCheck = {'IsDerivedFrom': 0, 'IsSourceOf': 0} + reports = {'TotalErrors': 0, 'ThumbnailError': 'None', 'ItemTested': 0, 'IsDerivedFrom': [], 'IsSourceOf': []} thumbnailFound = False if 'datacite' in obj: @@ -178,7 +181,7 @@ def getDataciteReport(obj_list, obj, mapped_mimetype, filePath): try: actualPath = urllib.parse.urljoin(filePath, path) found = next((i for i, item in enumerate(obj_list) if item['dataset']['path'] == actualPath), None) - if found == None: + if found is None: reports[key].append( { 'RelativePath': path, @@ -186,9 +189,9 @@ def getDataciteReport(obj_list, obj, mapped_mimetype, filePath): 'ReasonDetails': doc_link + '#reason-cannot-find-the-path' } ) - reports['TotalErrors'] +=1 + reports['TotalErrors'] += 1 elif key == 'IsSourceOf': - #Check for thumbnail + # Check for thumbnail thumbnailFound = checkForThumbnail(obj_list[found], obj_list) except: reports[key].append( @@ -198,21 +201,22 @@ def getDataciteReport(obj_list, obj, mapped_mimetype, filePath): 'ReasonDetails': doc_link + '#reason-encounter-a-problem-while-looking-for-path' } ) - reports['TotalErrors'] +=1 + reports['TotalErrors'] += 1 if mapped_mimetype in MIMETYPE_WITH_THUMBNAILS: if keysToCheck['IsSourceOf'] == 0: reports['ThumbnailError'] = 'Missing IsSourceOf entry' reports['ThumbnailErrorDetails'] = doc_link + '#thumbnailerror-missing-issourceof-entry' - reports['TotalErrors'] +=1 - if thumbnailFound == False: + reports['TotalErrors'] += 1 + if not thumbnailFound: reports['ThumbnailError'] = 'Thumbnail not found in IsSourceOf' reports['ThumbnailErrorDetails'] = doc_link + '#thumbnailerror-thumbnail-not-found-in-issourceof' - reports['TotalErrors'] +=1 + reports['TotalErrors'] += 1 return reports -#Test object to check for any possible error + +# Test object to check for any possible error def testObj(obj_list, obj, mime_type, mapped_mime_type, prefix, bucket): dataciteReport = None fileResponse = None @@ -223,7 +227,7 @@ def testObj(obj_list, obj, mime_type, mapped_mime_type, prefix, bucket): fileResponse = getFileResponse(localPath, path, mime_type, bucket) dataciteReport = getDataciteReport(obj_list, obj, mapped_mime_type, localPath) if dataciteReport['TotalErrors'] > 0: - if fileResponse == None: + if fileResponse is None: fileResponse = { 'Mimetype': mime_type, 'Path': localPath, @@ -234,11 +238,12 @@ def testObj(obj_list, obj, mime_type, mapped_mime_type, prefix, bucket): 'Mimetype': mime_type, 'Path': 'Not found', 'Reason': 'Cannot find path', - 'Reason': doc_link + '#reason-cannot-find-the-path' + 'ReasonDetails': doc_link + '#reason-cannot-find-the-path' } - + return fileResponse + def test_obj_list(id, version, obj_list, scaffoldTag, bucket): objectErrors = [] prefix = f"{id}/files" @@ -248,7 +253,7 @@ def test_obj_list(id, version, obj_list, scaffoldTag, bucket): for obj in obj_list: mime_type = getObjectMimeType(obj) - mapped_mime_type = map_mime_type(mime_type) + mapped_mime_type = map_mime_type(mime_type) if mapped_mime_type == NOT_SPECIFIED: pass else: @@ -259,33 +264,33 @@ def test_obj_list(id, version, obj_list, scaffoldTag, bucket): error = testObj(obj_list, obj, mime_type, mapped_mime_type, prefix, bucket) if error: objectErrors.append(error) - - if foundScaffold == True: - if foundContextInfo == False: + + if foundScaffold: + if not foundContextInfo: datasetErrors.append({ 'Reason': 'Contextual Information cannot be found while scaffold is present', 'Details': doc_link + '#contextual-information-cannot-be-found-while-scaffold-is-present' }) - if scaffoldTag == False: + if not scaffoldTag: datasetErrors.append({ 'Reason': 'Scaffold found in objects list but the dataset is not tagged with scaffold (types.item.name)', 'Details': doc_link + '#scaffold-found-in-objects-list-but-the-dataset-is-not-tagged-with-scaffold-typesitemname' }) - elif scaffoldTag == True: + elif scaffoldTag: datasetErrors.append({ 'Reason': 'Dataset is tagged with scaffold (types.item.name) but no scaffold can be found in the list of objects.', 'Details': doc_link + '#dataset-is-tagged-with-scaffold-typesitemname-but-no-scaffold-can-be-found-in-the-list-of-objects' }) - numberOfErrors = len(objectErrors) fileReports = { 'Total': numberOfErrors, 'Objects': objectErrors } return {"FileReports": fileReports, "DatasetErrors": datasetErrors} - -#Test the dataset + + +# Test the dataset def test_datasets_information(dataset): scaffoldTag = False report = { @@ -293,9 +298,9 @@ def test_datasets_information(dataset): 'DOI': 'none', '_id': dataset['_id'], 'Errors': [], - 'ObjectErrors': {'Total': 0, 'Objects':[]} + 'ObjectErrors': {'Total': 0, 'Objects': []} } - if '_source' in dataset : + if '_source' in dataset: source = dataset['_source'] if 'item' in source: report['Name'] = source['item'].get('name', 'none') @@ -336,8 +341,7 @@ def test_files_information(self): keepGoing = True totalSize = 0 reportOutput = 'reports/error_reports.json' - reports = {'Tested': 0, 'Failed': 0, 'FailedIds':[], 'Datasets':[]} - + reports = {'Tested': 0, 'Failed': 0, 'FailedIds': [], 'Datasets': []} while keepGoing: scicrunch_response = getDatasets(start, size) @@ -345,11 +349,11 @@ def test_files_information(self): data = scicrunch_response.json() - #No more result, stop + # No more result, stop if size > len(data['hits']['hits']): keepGoing = False - #keepGoing= False + # keepGoing= False start = start + size @@ -360,7 +364,6 @@ def test_files_information(self): reports['FailedIds'].append(report['Id']) reports['Datasets'].append(report) - # Generate the report reports['Tested'] = totalSize print(f"Number of datasets tested: {reports['Tested']}") @@ -368,14 +371,15 @@ def test_files_information(self): print(f"Number of dataset with erros: {reports['Failed']}") if reports['Failed'] > 0: print(f"Failed Datasets: {reports['FailedIds']}") - + os.makedirs(os.path.dirname(reportOutput), exist_ok=True) with open(reportOutput, 'w') as outfile: json.dump(reports, outfile, indent=4) - + print(f"Full report has been generated at {reportOutput}") self.assertEqual(0, len(reports['FailedIds'])) + if __name__ == '__main__': unittest.main()