From e45b8a7baff5ccbf75bce8c606b00a7644143641 Mon Sep 17 00:00:00 2001 From: Martin Malina Date: Fri, 13 Dec 2024 16:40:01 +0100 Subject: [PATCH] fix(RELEASE-1345): avoid failing on invalid purl string (#343) The sboms can contain invalid non-rpm purl strings and we would fail on parsing them using the packageurl module. Avoid these failures by parsing the purl type in our own function and only pass it to packageurl if it's an rpm purl. Otherwise skip it. Signed-off-by: Martin Malina --- pyxis/test_upload_rpm_data.py | 33 +++++++++++++++++++++++++ pyxis/test_upload_rpm_data_cyclonedx.py | 33 +++++++++++++++++++++++++ pyxis/upload_rpm_data.py | 33 +++++++++++++++++++++++-- pyxis/upload_rpm_data_cyclonedx.py | 33 +++++++++++++++++++++++-- 4 files changed, 128 insertions(+), 4 deletions(-) diff --git a/pyxis/test_upload_rpm_data.py b/pyxis/test_upload_rpm_data.py index 56c7aae..aba9bf6 100644 --- a/pyxis/test_upload_rpm_data.py +++ b/pyxis/test_upload_rpm_data.py @@ -10,6 +10,7 @@ update_container_content_sets, load_sbom_packages, construct_rpm_items_and_content_sets, + get_purl_type, ) GRAPHQL_API = "myapiurl" @@ -458,3 +459,35 @@ def test_construct_rpm_items_and_content_sets__no_packages_result_in_empty_list( assert rpms == [] assert content_sets == [] + + +def test_get_purl_type__rpm(): + purl = ( + "pkg:rpm/rhel/acl@2.3.1-4.el9?arch=x86_64&upstream=acl-2.3.1-4.el9.src.rpm" + "&distro=rhel-9.4&repository_id=myrepo3" + ) + + type = get_purl_type(purl) + + assert type == "rpm" + + +def test_get_purl_type__invalid_docker(): + """This is an invalid purl that packageurl.PackageURL.from_string() would fail on, + but we can still get the type successfully. + """ + purl = "pkg:github/docker:/#docker.mirror.hashicorp.services/rhysd/actionlint:latest" + + type = get_purl_type(purl) + + assert type == "github" + + +def test_get_purl_type__missing_type(): + """This is an invalid purl that does not have a type, so the function will throw + an exception. 
+ """ + purl = "pkg:docker:#docker.mirror.hashicorp.services" + + with pytest.raises(ValueError): + get_purl_type(purl) diff --git a/pyxis/test_upload_rpm_data_cyclonedx.py b/pyxis/test_upload_rpm_data_cyclonedx.py index bdc4de4..c953a10 100644 --- a/pyxis/test_upload_rpm_data_cyclonedx.py +++ b/pyxis/test_upload_rpm_data_cyclonedx.py @@ -11,6 +11,7 @@ load_sbom_components, check_bom_ref_duplicates, construct_rpm_items_and_content_sets, + get_purl_type, ) GRAPHQL_API = "myapiurl" @@ -435,3 +436,35 @@ def test_construct_rpm_items_and_content_sets__no_components_result_in_empty_lis assert rpms == [] assert content_sets == [] + + +def test_get_purl_type__rpm(): + purl = ( + "pkg:rpm/rhel/acl@2.3.1-4.el9?arch=x86_64&upstream=acl-2.3.1-4.el9.src.rpm" + "&distro=rhel-9.4&repository_id=myrepo3" + ) + + type = get_purl_type(purl) + + assert type == "rpm" + + +def test_get_purl_type__invalid_docker(): + """This is an invalid purl that packageurl.PackageURL.from_string() would fail on, + but we can still get the type successfully. + """ + purl = "pkg:github/docker:/#docker.mirror.hashicorp.services/rhysd/actionlint:latest" + + type = get_purl_type(purl) + + assert type == "github" + + +def test_get_purl_type__missing_type(): + """This is an invalid purl that does not have a type, so the function will throw + an exception. 
+ """ + purl = "pkg:docker:#docker.mirror.hashicorp.services" + + with pytest.raises(ValueError): + get_purl_type(purl) diff --git a/pyxis/upload_rpm_data.py b/pyxis/upload_rpm_data.py index 6a4e216..e246163 100755 --- a/pyxis/upload_rpm_data.py +++ b/pyxis/upload_rpm_data.py @@ -235,9 +235,10 @@ def construct_rpm_items_and_content_sets( for externalRef in package.get("externalRefs", []): if externalRef.get("referenceType") != "purl": continue - purl_dict = PackageURL.from_string(externalRef["referenceLocator"]).to_dict() - if purl_dict["type"] != "rpm": + type = get_purl_type(externalRef["referenceLocator"]) + if type != "rpm": continue + purl_dict = PackageURL.from_string(externalRef["referenceLocator"]).to_dict() if purl_dict["name"] in IGNORED_PACKAGES: continue rpm_item = { @@ -271,6 +272,34 @@ def construct_rpm_items_and_content_sets( return rpms_items, sorted(content_sets) +def get_purl_type(purl: str): + """ + Return purl type parsed from a purl string. + + Copied and adapted from packageurl package. The reason we need this function + and cannot simply use the type component of + packageurl.PackageURL.from_string(purl) is that there can be invalid non-rpm + purls generated by syft. By getting just the type first and skipping those + purls, we avoid failing on those invalid purls. + + Raise ValueError on errors. 
+ """ + scheme, sep, remainder = purl.partition(":") + if not sep or scheme != "pkg": + raise ValueError(f'purl is missing the required "pkg" scheme component: {repr(purl)}.') + + # strip the leading '/', '//' or '///' that may follow the scheme, as in pkg:// or pkg:/// + remainder = remainder.strip().lstrip("/") + + type, sep, remainder = remainder.partition("/") # NOQA + if not type or not sep: + raise ValueError(f"purl is missing the required type component: {repr(purl)}.") + + type = type.lower() + + return type + + def main(): # pragma: no cover """Main func""" args = parse_arguments() diff --git a/pyxis/upload_rpm_data_cyclonedx.py b/pyxis/upload_rpm_data_cyclonedx.py index 5c33fc0..9edca68 100755 --- a/pyxis/upload_rpm_data_cyclonedx.py +++ b/pyxis/upload_rpm_data_cyclonedx.py @@ -257,8 +257,9 @@ def construct_rpm_items_and_content_sets( content_sets = set() for component in components: if "purl" in component: - purl_dict = PackageURL.from_string(component["purl"]).to_dict() - if purl_dict["type"] == "rpm": + type = get_purl_type(component["purl"]) + if type == "rpm": + purl_dict = PackageURL.from_string(component["purl"]).to_dict() if purl_dict["name"] in IGNORED_PACKAGES: continue rpm_item = { @@ -292,6 +293,34 @@ def construct_rpm_items_and_content_sets( return rpms_items, sorted(content_sets) +def get_purl_type(purl: str): + """ + Return purl type parsed from a purl string. + + Copied and adapted from packageurl package. The reason we need this function + and cannot simply use the type component of + packageurl.PackageURL.from_string(purl) is that there can be invalid non-rpm + purls generated by syft. By getting just the type first and skipping those + purls, we avoid failing on those invalid purls. + + Raise ValueError on errors. 
+ """ + scheme, sep, remainder = purl.partition(":") + if not sep or scheme != "pkg": + raise ValueError(f'purl is missing the required "pkg" scheme component: {repr(purl)}.') + + # strip the leading '/', '//' or '///' that may follow the scheme, as in pkg:// or pkg:/// + remainder = remainder.strip().lstrip("/") + + type, sep, remainder = remainder.partition("/") # NOQA + if not type or not sep: + raise ValueError(f"purl is missing the required type component: {repr(purl)}.") + + type = type.lower() + + return type + + def main(): # pragma: no cover """Main func""" args = parse_arguments()