diff --git a/idc_index/index.py b/idc_index/index.py index 229524a7..d51d4dc0 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -574,8 +574,8 @@ def _validate_update_manifest_and_get_download_size( # create a copy of the index index_df_copy = self.index - # Extract s3 url and crdc_instance_uuid from the manifest copy commands - # Next, extract crdc_instance_uuid from aws_series_url in the index and + # Extract s3 url and crdc_series_uuid from the manifest copy commands + # Next, extract crdc_series_uuid from aws_series_url in the index and # try to verify if every series in the manifest is present in the index # TODO: need to remove the assumption that manifest commands will have 'cp' @@ -603,8 +603,9 @@ def _validate_update_manifest_and_get_download_size( seriesInstanceuid, s3_url, series_size_MB, - index_crdc_series_uuid==manifest_crdc_series_uuid AS crdc_series_uuid_match, + index_crdc_series_uuid is not NULL as crdc_series_uuid_match, s3_url==series_aws_url AS s3_url_match, + manifest_temp.manifest_cp_cmd, CASE WHEN s3_url==series_aws_url THEN 'aws' ELSE @@ -623,19 +624,23 @@ def _validate_update_manifest_and_get_download_size( endpoint_to_use = None - if validate_manifest: - # Check if crdc_instance_uuid is found in the index - if not all(merged_df["crdc_series_uuid_match"]): - missing_manifest_cp_cmds = merged_df.loc[ - ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" - ] - missing_manifest_cp_cmds_str = f"The following manifest copy commands do not have any associated series in the index: {missing_manifest_cp_cmds.tolist()}" - raise ValueError(missing_manifest_cp_cmds_str) + # Check if any crdc_series_uuid are not found in the index + if not all(merged_df["crdc_series_uuid_match"]): + missing_manifest_cp_cmds = merged_df.loc[ + ~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd" + ] + logger.error( + "The following manifest copy commands are not recognized as referencing any associated series in the index.\n" + "This means either these commands are invalid, or they may correspond to files available in a release of IDC\n" + f"different from {self.get_idc_version()} used in this version of idc-index. The corresponding files will not be downloaded.\n" + ) + logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist())) - # Check if there are more than one endpoints + if validate_manifest: + # Check if there is more than one endpoint if len(merged_df["endpoint"].unique()) > 1: raise ValueError( - "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. If so, please use urls from one provider only" + "Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. " ) if ( diff --git a/tests/idcindex.py b/tests/idcindex.py index e0a252ad..7e6c4a4b 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -461,6 +461,28 @@ def test_cli_download(self): ) assert len(os.listdir(Path.cwd())) != 0 + def test_prior_version_manifest(self): + c = IDCClient() + with tempfile.TemporaryDirectory() as temp_dir: + ( + total_size, + endpoint_to_use, + temp_manifest_file, + list_of_directories, + ) = c._validate_update_manifest_and_get_download_size( + "./prior_version_manifest.s5cmd", + temp_dir, + True, + False, + IDCClient.DOWNLOAD_HIERARCHY_DEFAULT, + ) + + # TODO: once issue https://github.com/ImagingDataCommons/idc-index/issues/100 + # is fully resolved, the manifest below should not be empty, and this test should be updated + # with count equal to 5 + with open(temp_manifest_file) as file: + assert len(file.readlines()) == 0 + if __name__ == "__main__": unittest.main() diff --git a/tests/prior_version_manifest.s5cmd b/tests/prior_version_manifest.s5cmd new file mode 100644 index 00000000..1c91a450 --- /dev/null +++ b/tests/prior_version_manifest.s5cmd @@ -0,0 +1,5 @@ +cp s3://idc-open-data/040fd3e1-0088-4bfd-8439-55e3c5d80a56/* . +cp s3://idc-open-data/04553d0f-1af9-414d-b631-cc31624aced5/* . +cp s3://idc-open-data/068346bf-16ef-4e45-87bf-87feb576a21c/* . +cp s3://idc-open-data/07908d47-5e85-45f3-9649-79c15f606f52/* . +cp s3://idc-open-data/099d180f-1d79-402d-abad-bfd8e2736b04/* .