From 1cac46181229dcf7bb5dede10be54cb3e410461c Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Mon, 29 Jul 2024 19:02:01 -0400 Subject: [PATCH 1/2] ENH: improve error reporting for unrecognized items from the manifest Re #100 Whenever a crdc_series_uuid from the manifest is not matched to those known to the index, provide error message informing the user of what could be the reasons. Fixed error checking for unrecognized items in the validation function. Report unrecognized items independently of whether validation is requested or not. --- idc_index/index.py | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 229524a7..47c5eaa9 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -574,8 +574,8 @@ def _validate_update_manifest_and_get_download_size( # create a copy of the index index_df_copy = self.index - # Extract s3 url and crdc_instance_uuid from the manifest copy commands - # Next, extract crdc_instance_uuid from aws_series_url in the index and + # Extract s3 url and crdc_series_uuid from the manifest copy commands + # Next, extract crdc_series_uuid from aws_series_url in the index and # try to verify if every series in the manifest is present in the index # TODO: need to remove the assumption that manifest commands will have 'cp' @@ -603,8 +603,9 @@ def _validate_update_manifest_and_get_download_size( seriesInstanceuid, s3_url, series_size_MB, - index_crdc_series_uuid==manifest_crdc_series_uuid AS crdc_series_uuid_match, + index_crdc_series_uuid is not NULL as crdc_series_uuid_match, s3_url==series_aws_url AS s3_url_match, + manifest_temp.manifest_cp_cmd, CASE WHEN s3_url==series_aws_url THEN 'aws' ELSE @@ -623,19 +624,23 @@ def _validate_update_manifest_and_get_download_size( endpoint_to_use = None - if validate_manifest: - # Check if crdc_instance_uuid is found in the index - if not all(merged_df["crdc_series_uuid_match"]): - missing_manifest_cp_cmds = merged_df.loc[ - ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" - ] - missing_manifest_cp_cmds_str = f"The following manifest copy commands do not have any associated series in the index: {missing_manifest_cp_cmds.tolist()}" - raise ValueError(missing_manifest_cp_cmds_str) + # Check if any crdc_series_uuid are not found in the index + if not merged_df["crdc_series_uuid_match"].all(): + missing_manifest_cp_cmds = merged_df.loc[ + ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" + ] + logger.error( + "The following manifest copy commands are not recognized as referencing any associated series in the index.\n" + "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n" + f"prior to {self.get_idc_version()}. The corresponding files will not be downloaded.\n" + ) + logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) - # Check if there are more than one endpoints + if validate_manifest: + # Check if there is more than one endpoint if len(merged_df["endpoint"].unique()) > 1: raise ValueError( - "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. If so, please use urls from one provider only" + "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. " ) if ( From 7a516d878e24a935fc230017f0134ff2c46801d5 Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Tue, 30 Jul 2024 10:27:49 -0400 Subject: [PATCH 2/2] ENH: add test of the manifest that has mismatches --- idc_index/index.py | 4 ++-- tests/idcindex.py | 22 ++++++++++++++++++++++ tests/prior_version_manifest.s5cmd | 5 +++++ 3 files changed, 29 insertions(+), 2 deletions(-) create mode 100644 tests/prior_version_manifest.s5cmd diff --git a/idc_index/index.py b/idc_index/index.py index 47c5eaa9..d51d4dc0 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -625,14 +625,14 @@ def _validate_update_manifest_and_get_download_size( endpoint_to_use = None # Check if any crdc_series_uuid are not found in the index - if not merged_df["crdc_series_uuid_match"].all(): + if not all(merged_df["crdc_series_uuid_match"]): missing_manifest_cp_cmds = merged_df.loc[ ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" ] logger.error( "The following manifest copy commands are not recognized as referencing any associated series in the index.\n" "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n" - f"prior to {self.get_idc_version()}. The corresponding files will not be downloaded.\n" + f"different from {self.get_idc_version()} used in this version of idc-index. The corresponding files will not be downloaded.\n" ) logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) diff --git a/tests/idcindex.py b/tests/idcindex.py index e0a252ad..7e6c4a4b 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -461,6 +461,28 @@ def test_cli_download(self): ) assert len(os.listdir(Path.cwd())) != 0 + def test_prior_version_manifest(self): + c = IDCClient() + with tempfile.TemporaryDirectory() as temp_dir: + ( + total_size, + endpoint_to_use, + temp_manifest_file, + list_of_directories, + ) = c._validate_update_manifest_and_get_download_size( + "./prior_version_manifest.s5cmd", + temp_dir, + True, + False, + IDCClient.DOWNLOAD_HIERARCHY_DEFAULT, + ) + + # TODO: once issue https://github.com/ImagingDataCommons/idc-index/issues/100 + # is fully resolved, the manifest below should not be empty, and this test should be updated + # with count equal to 5 + with open(temp_manifest_file) as file: + assert len(file.readlines()) == 0 + if __name__ == "__main__": unittest.main() diff --git a/tests/prior_version_manifest.s5cmd b/tests/prior_version_manifest.s5cmd new file mode 100644 index 00000000..1c91a450 --- /dev/null +++ b/tests/prior_version_manifest.s5cmd @@ -0,0 +1,5 @@ +cp s3://idc-open-data/040fd3e1-0088-4bfd-8439-55e3c5d80a56/* . +cp s3://idc-open-data/04553d0f-1af9-414d-b631-cc31624aced5/* . +cp s3://idc-open-data/068346bf-16ef-4e45-87bf-87feb576a21c/* . +cp s3://idc-open-data/07908d47-5e85-45f3-9649-79c15f606f52/* . +cp s3://idc-open-data/099d180f-1d79-402d-abad-bfd8e2736b04/* .