From 4ed00cc99a5d8bf407dd8008df7337f485c1ea96 Mon Sep 17 00:00:00 2001
From: Jono Yang
Date: Thu, 2 Nov 2023 18:09:52 -0700
Subject: [PATCH] Look for purl info using sha1

    * Do this when we cannot find a package download url using existing purl values

Signed-off-by: Jono Yang
---
Review notes (this section is ignored by git am):
- Line breaks and indentation were lost in the copy this patch was restored
  from. Context and removed lines are re-indented from the Python syntax:
  check them against the target file before applying. The blob ids on the
  "index" line are the original ones.
- PackageURL is used by MavenArtifact but no import of it is visible in this
  patch: confirm that the module already imports it.
- Fixed while restoring: list.count() called without argument (TypeError)
  replaced by len(); a failed sha1 lookup no longer deletes the package nor
  iterates over False; operator precedence of the g/a/v check; hyphenated
  classifiers in `ec`; mutable default argument; timestamps page fetched
  through the shared session with a timeout; sha1 updates counted once.

 .../commands/update_maven_download_urls.py    | 237 ++++++++++++++++--
 1 file changed, 223 insertions(+), 14 deletions(-)

diff --git a/packagedb/management/commands/update_maven_download_urls.py b/packagedb/management/commands/update_maven_download_urls.py
index b294b15c..bcaec888 100644
--- a/packagedb/management/commands/update_maven_download_urls.py
+++ b/packagedb/management/commands/update_maven_download_urls.py
@@ -7,6 +7,7 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
+from dateutil.parser import parse as dateutil_parse
 import copy
 import logging
 import sys
@@ -18,6 +19,9 @@
 from requests.adapters import HTTPAdapter
 import requests
 
+from minecode.visitors.maven import collect_links_from_text
+from minecode.visitors.maven import filter_for_artifacts
+from packagedcode.maven import build_filename
 from minecode.management.commands import VerboseCommand
 from packagedb.models import Package
 from packagedcode.maven import get_urls
@@ -32,9 +36,107 @@
 logger.setLevel(logging.INFO)
 
 session = Session()
+session.headers.update({
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
+})
 session.mount('https://', HTTPAdapter(max_retries=Retry(10)))
 
 
+def get_timestamps_by_links(package_version_page_url):
+    """
+    Return a mapping of {link: parsed timestamp} for the links collected from
+    the page at `package_version_page_url` with `collect_links_from_text`.
+    Return an empty mapping if the page cannot be fetched.
+    """
+    timestamps_by_links = {}
+    try:
+        # Go through the shared session so that its User-Agent and retry policy
+        # apply, and set a timeout so that a stalled server cannot hang the run.
+        response = session.get(package_version_page_url, timeout=DEFAULT_TIMEOUT)
+    except requests.RequestException as exception:
+        logger.debug(f"[Exception] {exception}")
+        return timestamps_by_links
+    if response:
+        timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts)
+        timestamps_by_links = {
+            link: dateutil_parse(timestamp) for link, timestamp in timestamps_by_links.items()
+        }
+    return timestamps_by_links
+
+
+class MavenArtifact(object):
+    """
+    A Maven artifact with its package URL, repository URLs and release date.
+
+    Creating an instance fetches its repository homepage URL to look up the
+    release date, which is None when the artifact filename is not listed there.
+    `related_artifacts` holds one MavenArtifact per classifier entry of `ec`.
+    """
+    def __init__(self, namespace, name, version, qualifiers='', ec=()):
+        self.type = 'maven'
+        self.namespace = namespace
+        self.name = name
+        self.version = version
+        self.qualifiers = qualifiers
+        self.package_url = PackageURL(
+            type=self.type,
+            namespace=namespace,
+            name=name,
+            version=version,
+            qualifiers=qualifiers
+        )
+        urls = get_urls(
+            namespace=namespace,
+            name=name,
+            version=version,
+            qualifiers=self.package_url.qualifiers,
+        )
+        self.download_url = urls['repository_download_url']
+        self.repository_homepage_url = urls['repository_homepage_url']
+        self.api_data_url = urls['api_data_url']
+
+        qualifiers_mapping = self.package_url.qualifiers
+        filename = build_filename(
+            artifact_id=name,
+            version=version,
+            extension=qualifiers_mapping.get('type') or 'jar',
+            classifier=qualifiers_mapping.get('classifier'),
+        )
+        timestamps_by_links = get_timestamps_by_links(self.repository_homepage_url)
+        self.release_date = timestamps_by_links.get(filename)
+        self.related_artifacts = list(
+            self._populate_related_artifacts(
+                namespace=namespace,
+                name=name,
+                version=version,
+                ec=ec,
+            )
+        )
+
+    @classmethod
+    def _populate_related_artifacts(cls, namespace, name, version, ec):
+        """
+        Yield a MavenArtifact for each classifier entry of `ec`. Entries are
+        expected to look like '-sources.jar'; entries starting with '.' or
+        without a classifier are skipped.
+        """
+        filtered_ec = [entry for entry in ec if not entry.startswith('.')]
+        for entry in filtered_ec:
+            # Split on the first hyphen only: unpacking `entry.split('-')` raised
+            # a ValueError when the classifier itself contains a hyphen.
+            _, _, ending = entry.partition('-')
+            classifier = ending.split('.')[0]
+            if not classifier:
+                continue
+            qualifiers = f'classifier={classifier}'
+            yield cls(
+                namespace=namespace,
+                name=name,
+                version=version,
+                qualifiers=qualifiers,
+            )
+
+
 # This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179
 class MemorySavingQuerysetIterator(object):
     def __init__(self,queryset,max_obj_num=1000):
@@ -72,6 +174,44 @@ def check_download_url(url, timeout=DEFAULT_TIMEOUT):
         return False
 
 
+def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT):
+    """
+    Return a list of MavenArtifact for the artifacts that the Maven search API
+    reports for the checksum `sha1`; the list is empty when nothing matches.
+    Return None when the query itself failed, so that callers can tell a failed
+    lookup apart from an artifact that is missing on Maven.
+    """
+    maven_api_search_url = f'https://search.maven.org/solrsearch/select?q=1:{sha1}'
+    try:
+        response = session.get(maven_api_search_url, timeout=timeout)
+        response.raise_for_status()
+        # A body that is not valid JSON raises a ValueError, handled below
+        contents = response.json()
+    except (requests.RequestException, ValueError, TypeError) as exception:
+        logger.debug(f"[Exception] {exception}")
+        return None
+    resp = contents.get('response', {})
+    matched_artifacts = []
+    if resp.get('numFound', 0) > 0:
+        for matched_artifact in resp.get('docs', []):
+            namespace = matched_artifact.get('g', '')
+            name = matched_artifact.get('a', '')
+            version = matched_artifact.get('v', '')
+            ec = matched_artifact.get('ec', [])
+            # Skip results that miss any of the namespace, name or version
+            if not (namespace and name and version):
+                continue
+            matched_artifacts.append(
+                MavenArtifact(
+                    namespace=namespace,
+                    name=name,
+                    version=version,
+                    ec=ec,
+                )
+            )
+    return matched_artifacts
+
+
 class Command(VerboseCommand):
     help = 'Update maven Package download_url values'
 
@@ -81,20 +221,39 @@ def handle(self, *args, **options):
         logger.info(f'Checking {maven_packages_count:,} Maven Package download URLs')
         packages_to_delete = []
         unsaved_packages = []
+        unsaved_packages_from_sha1_lookup = []
         processed_packages_count = 0
         for i, package in enumerate(MemorySavingQuerysetIterator(maven_packages)):
             if not i % 1000:
                 logger.info(f'Checked {i:,} / {maven_packages_count:,} Maven Package download URLs')
-            if not i % 2000 and unsaved_packages:
-                with transaction.atomic():
-                    Package.objects.bulk_update(
-                        objs=unsaved_packages,
-                        fields=[
-                            'download_url',
-                        ]
-                    )
-                processed_packages_count += unsaved_packages.count()
-                unsaved_packages = []
+            if not i % 2000:
+                if unsaved_packages:
+                    with transaction.atomic():
+                        Package.objects.bulk_update(
+                            objs=unsaved_packages,
+                            fields=[
+                                'download_url',
+                            ]
+                        )
+                    processed_packages_count += len(unsaved_packages)
+                    unsaved_packages = []
+                if unsaved_packages_from_sha1_lookup:
+                    with transaction.atomic():
+                        Package.objects.bulk_update(
+                            objs=unsaved_packages_from_sha1_lookup,
+                            fields=[
+                                'namespace',
+                                'name',
+                                'version',
+                                'download_url',
+                                'release_date',
+                                'repository_homepage_url',
+                                'repository_download_url',
+                                'api_data_url',
+                            ]
+                        )
+                    processed_packages_count += len(unsaved_packages_from_sha1_lookup)
+                    unsaved_packages_from_sha1_lookup = []
                 logger.info(f'Updated {processed_packages_count:,} Maven Packages')
             # If the package's download URL is not valid, then we update it
             if not check_download_url(package.download_url):
@@ -115,12 +274,43 @@ def handle(self, *args, **options):
                 if Package.objects.filter(download_url=generated_download_url).exists():
                     # This download url already exists in the database, we should just remove this record.
                     packages_to_delete.append(package)
+                    logger.info(f'Deleting {package.package_uid} - already exists in database')
                 elif check_download_url(generated_download_url):
                     package.download_url = generated_download_url
                     unsaved_packages.append(package)
                     logger.info(f'Updated download_url for {package.package_uid}')
                 else:
-                    logger.info(f'Error: cannot generate a valid download_url for package {package.package_uid}')
+                    # Fix purl values by looking up the package sha1 on maven
+                    matched_artifacts = query_sha1_on_maven(package.sha1)
+                    if matched_artifacts is None:
+                        # A failed lookup says nothing about the package: keep it
+                        logger.info(f'Skipping {package.package_uid} - sha1 lookup on Maven failed')
+                    elif not matched_artifacts:
+                        packages_to_delete.append(package)
+                        logger.info(f'Deleting {package.package_uid} - does not exist on Maven')
+                    for artifact in matched_artifacts or []:
+                        if (
+                            package.namespace.lower() == artifact.namespace.lower()
+                            and package.name.lower() == artifact.name.lower()
+                        ):
+                            if Package.objects.filter(download_url=artifact.download_url).exists():
+                                packages_to_delete.append(package)
+                                logger.info(f'Deleting {package.package_uid} - already exists in database')
+                            else:
+                                package.namespace = artifact.namespace
+                                package.name = artifact.name
+                                package.version = artifact.version
+                                package.download_url = artifact.download_url
+                                package.release_date = artifact.release_date
+                                package.repository_homepage_url = artifact.repository_homepage_url
+                                package.repository_download_url = artifact.download_url
+                                package.api_data_url = artifact.api_data_url
+                                unsaved_packages_from_sha1_lookup.append(package)
+                                logger.info(
+                                    f'Updated version for {package.package_uid}:\n'
+                                    f'\tversion: {package.version}'
+                                )
+                            break
 
         if unsaved_packages:
             with transaction.atomic():
@@ -132,10 +322,29 @@ def handle(self, *args, **options):
                 )
-            processed_packages_count += unsaved_packages.count()
+            processed_packages_count += len(unsaved_packages)
             unsaved_packages = []
-        logger.info(f'Updated {processed_packages_count:,} Maven Packages')
+
+        if unsaved_packages_from_sha1_lookup:
+            with transaction.atomic():
+                Package.objects.bulk_update(
+                    objs=unsaved_packages_from_sha1_lookup,
+                    fields=[
+                        'namespace',
+                        'name',
+                        'version',
+                        'download_url',
+                        'release_date',
+                        'repository_homepage_url',
+                        'repository_download_url',
+                        'api_data_url',
+                    ]
+                )
+            processed_packages_count += len(unsaved_packages_from_sha1_lookup)
+            unsaved_packages_from_sha1_lookup = []
+
+        logger.info(f'Updated {processed_packages_count:,} Maven Packages')
 
         if packages_to_delete:
             pks = [p.pk for p in packages_to_delete]
             with transaction.atomic():
                 Package.objects.filter(pk__in=pks).delete()
-            logger.info(f'Deleted {pks.count():,} Maven Package duplicates')
+            logger.info(f'Deleted {len(pks):,} duplicate/invalid Maven Packages')