Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: improve error reporting for unrecognized items from the manifest #105

Merged
merged 2 commits into from
Jul 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 18 additions & 13 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,8 +574,8 @@ def _validate_update_manifest_and_get_download_size(
# create a copy of the index
index_df_copy = self.index

# Extract s3 url and crdc_instance_uuid from the manifest copy commands
# Next, extract crdc_instance_uuid from aws_series_url in the index and
# Extract s3 url and crdc_series_uuid from the manifest copy commands
# Next, extract crdc_series_uuid from aws_series_url in the index and
# try to verify if every series in the manifest is present in the index

# TODO: need to remove the assumption that manifest commands will have 'cp'
Expand Down Expand Up @@ -603,8 +603,9 @@ def _validate_update_manifest_and_get_download_size(
seriesInstanceuid,
s3_url,
series_size_MB,
index_crdc_series_uuid==manifest_crdc_series_uuid AS crdc_series_uuid_match,
index_crdc_series_uuid is not NULL as crdc_series_uuid_match,
vkt1414 marked this conversation as resolved.
Show resolved Hide resolved
s3_url==series_aws_url AS s3_url_match,
manifest_temp.manifest_cp_cmd,
CASE
WHEN s3_url==series_aws_url THEN 'aws'
ELSE
Expand All @@ -623,19 +624,23 @@ def _validate_update_manifest_and_get_download_size(

endpoint_to_use = None

if validate_manifest:
# Check if crdc_instance_uuid is found in the index
if not all(merged_df["crdc_series_uuid_match"]):
missing_manifest_cp_cmds = merged_df.loc[
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
]
missing_manifest_cp_cmds_str = f"The following manifest copy commands do not have any associated series in the index: {missing_manifest_cp_cmds.tolist()}"
raise ValueError(missing_manifest_cp_cmds_str)
# Check if any crdc_series_uuid are not found in the index
if not all(merged_df["crdc_series_uuid_match"]):
missing_manifest_cp_cmds = merged_df.loc[
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
]
logger.error(
"The following manifest copy commands are not recognized as referencing any associated series in the index.\n"
"This means either these commands are invalid, or they may correspond to files available in a release of IDC\n"
f"different from {self.get_idc_version()} used in this version of idc-index. The corresponding files will not be downloaded.\n"
)
logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist()))

# Check if there are more than one endpoints
if validate_manifest:
# Check if there is more than one endpoint
if len(merged_df["endpoint"].unique()) > 1:
raise ValueError(
"Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. If so, please use urls from one provider only"
"Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. "
)

if (
Expand Down
22 changes: 22 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,28 @@ def test_cli_download(self):
)
assert len(os.listdir(Path.cwd())) != 0

def test_prior_version_manifest(self):
c = IDCClient()
with tempfile.TemporaryDirectory() as temp_dir:
(
total_size,
endpoint_to_use,
temp_manifest_file,
list_of_directories,
) = c._validate_update_manifest_and_get_download_size(
"./prior_version_manifest.s5cmd",
temp_dir,
True,
False,
IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
)

# TODO: once issue https://github.com/ImagingDataCommons/idc-index/issues/100
# is fully resolved, the manifest below should not be empty, and this test should be updated
# with count equal to 5
with open(temp_manifest_file) as file:
assert len(file.readlines()) == 0


if __name__ == "__main__":
unittest.main()
5 changes: 5 additions & 0 deletions tests/prior_version_manifest.s5cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cp s3://idc-open-data/040fd3e1-0088-4bfd-8439-55e3c5d80a56/* .
cp s3://idc-open-data/04553d0f-1af9-414d-b631-cc31624aced5/* .
cp s3://idc-open-data/068346bf-16ef-4e45-87bf-87feb576a21c/* .
cp s3://idc-open-data/07908d47-5e85-45f3-9649-79c15f606f52/* .
cp s3://idc-open-data/099d180f-1d79-402d-abad-bfd8e2736b04/* .
Loading