Skip to content

Commit

Permalink
ENH: improve error reporting for unrecognized items from the manifest
Browse files Browse the repository at this point in the history
Re ImagingDataCommons#100

Whenever a crdc_series_uuid from the manifest does not match any of those
known to the index, log an error message informing the user of the
possible reasons. Fix the error checking for unrecognized items in
the validation function. Report unrecognized items regardless of
whether validation is requested.
  • Loading branch information
fedorov committed Jul 29, 2024
1 parent b8bd83a commit 1cac461
Showing 1 changed file with 18 additions and 13 deletions.
31 changes: 18 additions & 13 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,8 +574,8 @@ def _validate_update_manifest_and_get_download_size(
# create a copy of the index
index_df_copy = self.index

# Extract s3 url and crdc_instance_uuid from the manifest copy commands
# Next, extract crdc_instance_uuid from aws_series_url in the index and
# Extract s3 url and crdc_series_uuid from the manifest copy commands
# Next, extract crdc_series_uuid from aws_series_url in the index and
# try to verify if every series in the manifest is present in the index

# TODO: need to remove the assumption that manifest commands will have 'cp'
Expand Down Expand Up @@ -603,8 +603,9 @@ def _validate_update_manifest_and_get_download_size(
seriesInstanceuid,
s3_url,
series_size_MB,
index_crdc_series_uuid==manifest_crdc_series_uuid AS crdc_series_uuid_match,
index_crdc_series_uuid is not NULL as crdc_series_uuid_match,
s3_url==series_aws_url AS s3_url_match,
manifest_temp.manifest_cp_cmd,
CASE
WHEN s3_url==series_aws_url THEN 'aws'
ELSE
Expand All @@ -623,19 +624,23 @@ def _validate_update_manifest_and_get_download_size(

endpoint_to_use = None

if validate_manifest:
# Check if crdc_instance_uuid is found in the index
if not all(merged_df["crdc_series_uuid_match"]):
missing_manifest_cp_cmds = merged_df.loc[
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
]
missing_manifest_cp_cmds_str = f"The following manifest copy commands do not have any associated series in the index: {missing_manifest_cp_cmds.tolist()}"
raise ValueError(missing_manifest_cp_cmds_str)
# Check if any crdc_series_uuid are not found in the index
if not merged_df["crdc_series_uuid_match"].all():
missing_manifest_cp_cmds = merged_df.loc[
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
]
logger.error(
"The following manifest copy commands are not recognized as referencing any associated series in the index.\n"
"This means either these commands are invalid, or they may correspond to files available in a release of IDC\n"
f"prior to {self.get_idc_version()}. The corresponding files will not be downloaded.\n"
)
logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist()))

# Check if there are more than one endpoints
if validate_manifest:
# Check if there is more than one endpoint
if len(merged_df["endpoint"].unique()) > 1:
raise ValueError(
"Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. If so, please use urls from one provider only"
"Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. "
)

if (
Expand Down

0 comments on commit 1cac461

Please sign in to comment.