diff --git a/matchcode/api.py b/matchcode/api.py index 092933e3..68844b8d 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -15,6 +15,7 @@ from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.serializers import CharField +from rest_framework.serializers import FloatField from rest_framework.serializers import HyperlinkedRelatedField from rest_framework.serializers import ModelSerializer from rest_framework.serializers import ReadOnlyField @@ -24,6 +25,7 @@ from matchcode_toolkit.fingerprinting import create_halohash_chunks from matchcode_toolkit.fingerprinting import hexstring_to_binarray from matchcode_toolkit.fingerprinting import split_fingerprint +from matchcode_toolkit.halohash import byte_hamming_distance from matchcode.models import ExactFileIndex from matchcode.models import ExactPackageArchiveIndex from matchcode.models import ApproximateDirectoryContentIndex @@ -91,6 +93,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer): lookup_field='uuid', read_only=True ) + similarity_score = FloatField() class CharMultipleWidget(widgets.TextInput): @@ -271,11 +274,18 @@ def match(self, request): for fingerprint in unique_fingerprints: matches = model_class.match(fingerprint) for match in matches: + _, bah128 = split_fingerprint(fingerprint) + # Get fingerprint from the match + fp = match.fingerprint() + _, match_bah128 = split_fingerprint(fp) + hd = byte_hamming_distance(bah128, match_bah128) + similarity_score = (128 - hd) / 128 results.append( { 'fingerprint': fingerprint, - 'matched_fingerprint': match.fingerprint(), + 'matched_fingerprint': fp, 'package': match.package, + 'similarity_score': similarity_score, } ) diff --git a/matchcode/tests/test_api.py b/matchcode/tests/test_api.py index be971081..8decc568 100644 --- a/matchcode/tests/test_api.py +++ b/matchcode/tests/test_api.py @@ -117,6 +117,7 @@ def test_api_approximate_directory_content_index_match_close_match(self): self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual(0.9453125, result['similarity_score']) def test_api_approximate_directory_structure_index_match_close_match(self): # This test fingerprint has a hamming distance of 7 from the expected fingerprint @@ -133,6 +134,7 @@ def test_api_approximate_directory_structure_index_match_close_match(self): self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual(0.9453125, result['similarity_score']) def test_api_approximate_directory_content_index_match(self): test_fingerprint = '00000007af7d63765c78fa516b5353f5ffa7df45' @@ -147,6 +149,7 @@ def test_api_approximate_directory_content_index_match(self): self.assertEqual(test_fingerprint, result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual(1.0, result['similarity_score']) def test_api_approximate_directory_structure_index_match(self): test_fingerprint = '00000004d10982208810240820080a6a3e852486' @@ -161,3 +164,4 @@ def test_api_approximate_directory_structure_index_match(self): self.assertEqual(test_fingerprint, 
result['matched_fingerprint']) expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid]) self.assertEqual(expected_package, result['package']) + self.assertEqual(1.0, result['similarity_score']) diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py new file mode 100644 index 00000000..c120b67e --- /dev/null +++ b/minecode/management/commands/get_maven_release_dates.py @@ -0,0 +1,75 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from dateutil.parser import parse as dateutil_parse +from os.path import dirname +import logging +import sys + +import requests + +from minecode.management.commands import VerboseCommand +from minecode.visitors.maven import collect_links_from_text +from minecode.visitors.maven import filter_for_artifacts +from packagedb.models import Package + + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout) +logger.setLevel(logging.INFO) + +TRACE = False +if TRACE: + logger.setLevel(logging.DEBUG) + + +class Command(VerboseCommand): + help = 'Get and set release_date for Maven Packages' + + def handle(self, *args, **options): + queryset = Package.objects.filter( + type='maven', + release_date=None, + download_url__startswith='https://repo1.maven.org/maven2' + ) + object_count = queryset.count() + chunk_size = 2000 + iterator = queryset.iterator(chunk_size=chunk_size) + unsaved_objects = [] + + logger.info(f'Updating release_date for {object_count} packages') + for index, package in enumerate(iterator, start=1): + download_url = package.download_url + package_url = package.package_url + logger.info(f'Updating release_date for package {package_url} ({download_url})') + package_version_page_url = dirname(download_url) + filename = download_url.rsplit('/')[-1] + response = requests.get(package_version_page_url) + if response: + timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts) + timestamp = timestamps_by_links.get(filename) + if not timestamp: + logger.info(f'\tCould not get release_date for package {package_url} ({download_url})') + continue + timestamp = dateutil_parse(timestamp) + package.release_date = timestamp + unsaved_objects.append(package) + logger.info(f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}') + else: + logger.info(f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}') + if not (index % chunk_size) and unsaved_objects: + logger.info(f'{index:,} / {object_count:,} Packages processed') + + logger.info('Updating Package objects...') + updated_packages_count = Package.objects.bulk_update( + objs=unsaved_objects, + fields=['release_date'], + batch_size=1000, + ) + logger.info(f'Updated {updated_packages_count} Package objects') diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py new file mode 100644 index 00000000..55862921 --- /dev/null +++ b/minecode/management/commands/import_queue.py @@ -0,0 +1,167 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. 
+# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from dateutil.parser import parse as dateutil_parse +import logging +import signal +import sys +import time + +import requests + +from django.db import transaction +from django.utils import timezone +from packageurl import PackageURL + +from minecode.management.commands import get_error_message +from minecode.management.commands import VerboseCommand +from minecode.models import ImportableURI +from minecode.visitors.maven import get_artifact_links +from minecode.visitors.maven import get_classifier_from_artifact_url +from minecode.visitors.maven import collect_links_from_text +from minecode.visitors.maven import filter_only_directories +from minecode.visitors.maven import get_artifact_sha1 +from minecode.model_utils import merge_or_create_package +from packagedcode.models import PackageData +from packagedb.models import Package +from minecode.visitors.maven import determine_namespace_name_version_from_url + + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout) +logger.setLevel(logging.INFO) + +TRACE = False +if TRACE: + logger.setLevel(logging.DEBUG) + +# sleep duration in seconds when the queue is empty +SLEEP_WHEN_EMPTY = 10 + +MUST_STOP = False + + +def stop_handler(*args, **kwargs): + """ + Signal handler to set global variable to True. + """ + global MUST_STOP + MUST_STOP = True + + +signal.signal(signal.SIGTERM, stop_handler) + + +class Command(VerboseCommand): + help = 'Run a Package request queue.' + + def handle(self, *args, **options): + """ + Get the next processable ImportableURI and start the + processing. Loops forever and sleeps a short while if there are + no ImportableURI left to process.
+ """ + + global MUST_STOP + + sleeping = False + processed_counter = 0 + + while True: + if MUST_STOP: + logger.info('Graceful exit of the request queue.') + break + + with transaction.atomic(): + importable_uri = ImportableURI.objects.get_next_request() + + if not importable_uri: + # Only log a single message when we go to sleep + if not sleeping: + sleeping = True + logger.info('No more processable request, sleeping...') + + time.sleep(SLEEP_WHEN_EMPTY) + continue + + sleeping = False + + # process request + logger.info('Processing {}'.format(importable_uri)) + try: + errors = process_request(importable_uri) + except Exception as e: + errors = 'Error: Failed to process ImportableURI: {}\n'.format( + repr(importable_uri)) + errors += get_error_message(e) + finally: + if errors: + importable_uri.processing_error = errors + logger.error(errors) + importable_uri.processed_date = timezone.now() + importable_uri.wip_date = None + importable_uri.save() + processed_counter += 1 + + return processed_counter + + +def process_request(importable_uri): + uri = importable_uri.uri + uri = uri.rstrip('/') + data = importable_uri.data + if not data: + # collect data again if we don't have it + response = requests.get(uri) + if response: + data = requests.text + + purl = importable_uri.package_url + if purl: + package_url = PackageURL.from_string(purl) + namespace = package_url.namespace + name = package_url.name + else: + namespace, name, _ = determine_namespace_name_version_from_url(uri) + + timestamps_by_directory_links = collect_links_from_text(data, filter_only_directories) + # Go into each version directory + for directory_link in timestamps_by_directory_links.keys(): + version = directory_link.rstrip('/') + version_page_url = f'{uri}/{version}' + timestamps_by_artifact_links = get_artifact_links(version_page_url) + for artifact_link, timestamp in timestamps_by_artifact_links.items(): + sha1 = get_artifact_sha1(artifact_link) + classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version) + qualifiers = None + if classifier: + qualifiers = f'classifier={classifier}' + release_date = dateutil_parse(timestamp) + package_data = PackageData( + type='maven', + namespace=namespace, + name=name, + version=version, + qualifiers=qualifiers, + download_url=artifact_link, + sha1=sha1, + release_date=release_date, + ) + package, created, merged, map_error = merge_or_create_package( + scanned_package=package_data, + visit_level=50 + ) + if created: + logger.info(f'Created package {package}') + if merged: + logger.info(f'Updated package {package}') + if map_error: + logger.error(f'Error encountered: {map_error}') + importable_uri.processing_error = map_error + importable_uri.save() diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py new file mode 100644 index 00000000..30c8f360 --- /dev/null +++ b/minecode/management/commands/maven_crawler.py @@ -0,0 +1,31 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import logging +import sys + +from minecode.management.commands import VerboseCommand +from minecode.visitors.maven import crawl_maven_repo_from_root + + +logger = logging.getLogger(__name__) +logging.basicConfig(stream=sys.stdout) +logger.setLevel(logging.INFO) + +TRACE = False +if TRACE: + logger.setLevel(logging.DEBUG) + + +class Command(VerboseCommand): + help = 'Crawl a Maven repository and add package pages to the import queue.' + + def handle(self, *args, **options): + maven_root_url = 'https://repo.maven.apache.org/maven2' + crawl_maven_repo_from_root(root_url=maven_root_url) diff --git a/minecode/migrations/0031_importableuri.py b/minecode/migrations/0031_importableuri.py new file mode 100644 index 00000000..0d557312 --- /dev/null +++ b/minecode/migrations/0031_importableuri.py @@ -0,0 +1,181 @@ +# Generated by Django 4.1.2 on 2023-09-12 00:14 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("minecode", "0030_scannableuri_rescan_alter_scannableuri_scan_status"), + ] + + operations = [ + migrations.CreateModel( + name="ImportableURI", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "uri", + models.CharField( + db_index=True, + help_text="URI for this resource. This is the unmodified original URI.", + max_length=2048, + ), + ), + ( + "canonical", + models.CharField( + db_index=True, + help_text="Canonical form of the URI for this resource that must be unique across all ResourceURI.", + max_length=3000, + ), + ), + ( + "source_uri", + models.CharField( + blank=True, + help_text="Optional: real source remote URI for this visit. For example, for a package repository, an index is a typical source via which a first level of package data is fetched. And it is not the URI in the uri field. It is just the source of the fetch. Or the source may be a mirror URI used for fetching.", + max_length=2048, + null=True, + ), + ), + ( + "priority", + models.PositiveIntegerField( + db_index=True, + default=0, + help_text="Absolute processing priority of a URI (default to zero), higher number means higher priority, zero means lowest priority.", + ), + ), + ( + "wip_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Work In Progress.
This is a timestamp set at the start of a visit or mapping or indexing or null when no processing is in progress.", + null=True, + ), + ), + ( + "file_name", + models.CharField( + blank=True, + db_index=True, + help_text="File name of a resource sometimes part of the URI proper and sometimes only available through an HTTP header.", + max_length=255, + null=True, + ), + ), + ( + "size", + models.PositiveIntegerField( + blank=True, + db_index=True, + help_text="Size in bytes of the file represented by this ResourceURI.", + null=True, + ), + ), + ( + "sha1", + models.CharField( + blank=True, + db_index=True, + help_text="SHA1 checksum hex-encoded (as in the sha1sum command) of the content of the file represented by this ResourceURI.", + max_length=40, + null=True, + ), + ), + ( + "md5", + models.CharField( + blank=True, + db_index=True, + help_text="MD5 checksum hex-encoded (as in the md5sum command) of the content of the file represented by this ResourceURI.", + max_length=32, + null=True, + ), + ), + ( + "sha256", + models.CharField( + blank=True, + db_index=True, + help_text="SHA256 checksum hex-encoded (as in the sha256sum command) of the content of the file represented by this ResourceURI.", + max_length=64, + null=True, + ), + ), + ( + "last_modified_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp set to the last modified date of the remote resource represented by this URI such as the modified date of a file, the lastmod value on a sitemap or the modified date returned by an HTTP resource.", + null=True, + ), + ), + ( + "package_url", + models.CharField( + blank=True, + db_index=True, + help_text='Package URL for this resource. It stands for a package "mostly universal" URL.', + max_length=2048, + null=True, + ), + ), + ( + "data", + models.TextField( + blank=True, + help_text="Text content of the file represented by this ResourceURI. This contains the data that was fetched or extracted from a remote ResourceURI such as HTML or JSON.", + null=True, + ), + ), + ( + "request_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp set to the date of when this Package info was requested.", + null=True, + ), + ), + ( + "processed_date", + models.DateTimeField( + blank=True, + db_index=True, + help_text="Timestamp set to the date of when this Package info was processed.", + null=True, + ), + ), + ( + "has_processing_error", + models.BooleanField( + db_index=True, + default=False, + help_text="When set to True (Yes), this field indicates that an error has occurred when processing this URI.", + ), + ), + ( + "processing_error", + models.TextField( + blank=True, + help_text="Processing error messages. When present this means the processing failed.", + null=True, + ), + ), + ], + options={ + "verbose_name": "Importable URI", + }, + ), + ] diff --git a/minecode/models.py b/minecode/models.py index 16c44854..3a6f046e 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -31,10 +31,6 @@ logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) -# logger = logging.getLogger(__name__) -# handler = logging.StreamHandler() -# logger.addHandler(handler) - def get_canonical(uri): """ @@ -936,3 +932,134 @@ def save(self, *args, **kwargs): """ self.normalize_fields() super(PriorityResourceURI, self).save(*args, **kwargs) + + +# TODO: Use the QuerySet.as_manager() for more flexibility and chaining.
+class ImportableURIManager(models.Manager): + def insert(self, uri, data, package_url, **extra_fields): + """ + Create and return a new ImportableURI. + Return None if the insertion failed because an unprocessed ImportableURI with the same URI already exists + """ + # TODO: be able to create a request for an existing purl if the previous request has been completed already + + importable_uris = self.filter( + uri=uri, + **extra_fields + ) + if ( + importable_uris.count() == 0 + or all(p.processed_date for p in importable_uris) + ): + importable_uri = self.create( + uri=uri, + data=data, + package_url=package_url, + **extra_fields + ) + return importable_uri + + def in_progress(self): + """ + Limit the QuerySet to ImportableURI being processed. + """ + return self.filter(wip_date__isnull=False) + + def never_processed(self): + """ + Limit the QuerySet to ImportableURIs that have never been processed. + This is usually the state of an ImportableURI upon creation. + """ + return self.filter( + processed_date__isnull=True, + wip_date__isnull=True + ).order_by( + 'request_date' + ) + + def get_requests(self): + """ + Return an ordered query set of all processable ImportableURIs. + """ + never_processed = self.never_processed() + return never_processed + + def get_next_request(self): + """ + Return the next ImportableURI request for processing and mark it + as being "in_progress" by setting the wip_date field. + + Return None when there is no request left to visit. + + NOTE: this method can only be called from within a transaction.atomic + block. + """ + importable_uri = self.get_requests().select_for_update(skip_locked=True).first() + if not importable_uri: + return + importable_uri.wip_date = timezone.now() + importable_uri.save(update_fields=['wip_date']) + return importable_uri + + +# TODO: have a second queue for crawling maven repo, that tracks which pages and namespaces we visited +# when we hit the point of a package page, we add it to the queue that creates skinny packages for the package we visited. + +class ImportableURI(BaseURI): + package_url = models.CharField( + max_length=2048, + null=True, + blank=True, + db_index=True, + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + ) + + # This is a text blob that contains either HTML, JSON or anything + # stored as a string. This is the raw content of visiting a URI. + # NOTE: some visited URLs (such as an actual package archive) will/should NOT be stored there + data = models.TextField( + null=True, + blank=True, + help_text='Text content of the file represented by this ' + 'ResourceURI. This contains the data that was fetched or ' + 'extracted from a remote ResourceURI such as HTML or JSON.', + ) + + request_date = models.DateTimeField( + null=True, + blank=True, + db_index=True, + help_text='Timestamp set to the date of when this Package info was requested.', + ) + + processed_date = models.DateTimeField( + null=True, + blank=True, + db_index=True, + help_text='Timestamp set to the date of when this Package info was processed.', + ) + + has_processing_error = models.BooleanField( + db_index=True, + default=False, + help_text='When set to True (Yes), this field indicates that ' + 'an error has occurred when processing this URI.' + ) + + processing_error = models.TextField( + null=True, + blank=True, + help_text='Processing error messages.
When present this means the processing failed.', + ) + + objects = ImportableURIManager() + + class Meta: + verbose_name = 'Importable URI' + + def save(self, *args, **kwargs): + """ + Save, adding defaults for computed fields and validating fields. + """ + self.normalize_fields() + super(ImportableURI, self).save(*args, **kwargs) diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py index 8584f49d..2375f20b 100644 --- a/minecode/tests/test_maven.py +++ b/minecode/tests/test_maven.py @@ -868,3 +868,315 @@ def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, merged_package = maven_visitor.get_merged_ancestor_package_from_maven_package(package=db_package) expected_loc = self.get_test_loc('maven/pom/pulsar-client-merged-ancestor-package.json') self.check_expected_results(merged_package.to_dict(), expected_loc, regen=regen) + + +class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + + def test_check_if_file_name_is_linked_on_page(self): + links = ['foo/', 'bar/', 'baz/'] + self.assertTrue( + maven_visitor.check_if_file_name_is_linked_on_page('foo/', links) + ) + self.assertFalse( + maven_visitor.check_if_file_name_is_linked_on_page('qux/', links) + ) + + def test_check_if_page_has_pom_files(self): + links1 = ['foo/', 'bar.jar', 'bar.pom'] + links2 = ['foo/', 'bar.jar'] + self.assertTrue(maven_visitor.check_if_page_has_pom_files(links1)) + self.assertFalse(maven_visitor.check_if_page_has_pom_files(links2)) + + def test_check_if_page_has_directories(self): + links1 = ['foo/', 'bar/', 'baz/'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven_visitor.check_if_page_has_directories(links1)) + self.assertFalse(maven_visitor.check_if_page_has_directories(links2)) + + def test_check_if_package_version_page(self): + links1 = ['../', 'bar.pom', 'bar.jar'] + links2 = ['../', 'foo/', 'bar/', 'baz/'] + self.assertTrue(maven_visitor.check_if_package_version_page(links1)) + self.assertFalse(maven_visitor.check_if_package_version_page(links2)) + + def test_check_if_package_page(self): + links1 = ['../', 'maven-metadata.xml'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven_visitor.check_if_package_page(links1)) + self.assertFalse(maven_visitor.check_if_package_page(links2)) + + def test_check_if_maven_root(self): + links1 = ['../', 'archetype-catalog.xml'] + links2 = ['../', 'bar.pom', 'bar.jar'] + self.assertTrue(maven_visitor.check_if_maven_root(links1)) + self.assertFalse(maven_visitor.check_if_maven_root(links2)) + + @mock.patch('requests.get') + def test_check_on_page(self, mock_request_get): + checker = maven_visitor.check_if_page_has_pom_files + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'parent-7.11.0.pom' + self.assertTrue(maven_visitor.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) + + @mock.patch('requests.get') + def test_is_maven_root(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'archetype-catalog.xml' + self.assertTrue(maven_visitor.is_maven_root('https://repo1.maven.org/maven2/')) + + @mock.patch('requests.get') + def test_is_package_page(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'maven-metadata.xml' + self.assertTrue(maven_visitor.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/')) + + @mock.patch('requests.get') + def 
test_is_package_version_page(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = ''' + ../ + parent-7.11.0.pom + ''' + self.assertTrue(maven_visitor.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) + + def test_url_parts(self): + url = 'https://example.com/foo/bar/baz.jar' + scheme, netloc, path_segments = maven_visitor.url_parts(url) + self.assertEqual('https', scheme) + self.assertEqual('example.com', netloc) + self.assertEqual(['foo', 'bar', 'baz.jar'], path_segments) + + def test_create_url(self): + scheme = 'https' + netloc = 'example.com' + path_segments = ['foo', 'bar', 'baz.jar'] + url = 'https://example.com/foo/bar/baz.jar' + self.assertEqual( + url, + maven_visitor.create_url(scheme, netloc, path_segments) + ) + + @mock.patch('requests.get') + def test_get_maven_root(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'archetype-catalog.xml' + self.assertEqual( + 'https://repo1.maven.org/maven2', + maven_visitor.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + ) + + @mock.patch('requests.get') + def test_determine_namespace_name_version_from_url(self, mock_request_get): + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2' + root_url = 'https://repo1.maven.org/maven2' + + package_page_text = ''' + 1.0.b2/ + 2005-09-20 05:53 - + maven-metadata.xml + 2012-06-26 17:01 567 + ''' + package_page = mock.Mock(ok=True, text=package_page_text) + + package_version_page_text = ''' + ../ - + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + ''' + package_version_page = mock.Mock(ok=True, text=package_version_page_text) + mock_request_get.side_effect = [ + mock.Mock(ok=True, text=''), + mock.Mock(ok=True, text=''), + package_page, + mock.Mock(ok=True, text=''), + package_version_page + ] + + namespace, package_name, package_version = maven_visitor.determine_namespace_name_version_from_url(url, root_url) + self.assertEqual('xml-apis', namespace) + self.assertEqual('xml-apis', package_name) + self.assertEqual('1.0.b2', package_version) + + @mock.patch('requests.get') + def test_add_to_import_queue(self, mock_request_get): + from minecode.models import ImportableURI + + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + root_url = 'https://repo1.maven.org/maven2' + + package_page_text = ''' + 1.0.b2/ + 2005-09-20 05:53 - + maven-metadata.xml + 2012-06-26 17:01 567 + ''' + package_page = mock.Mock(ok=True, text=package_page_text) + + package_version_page_text = ''' + ../ - + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + ''' + package_version_page = mock.Mock(ok=True, text=package_version_page_text) + mock_request_get.side_effect = [ + package_page, + mock.Mock(ok=True, text=''), + mock.Mock(ok=True, text=''), + package_page, + mock.Mock(ok=True, text=''), + package_version_page + ] + + self.assertEqual(0, ImportableURI.objects.all().count()) + maven_visitor.add_to_import_queue(url, root_url) + self.assertEqual(1, ImportableURI.objects.all().count()) + importable_uri = ImportableURI.objects.get(uri=url) + self.assertEqual('pkg:maven/xml-apis/xml-apis', importable_uri.package_url) + + def test_filter_only_directories(self): + timestamps_by_links = { + '../': '-', + 'foo/': '-', + 'foo.pom': '2023-09-28', + } + expected = { + 'foo/': '-', + } + self.assertEqual( + expected, + maven_visitor.filter_only_directories(timestamps_by_links) + ) + + def test_filter_for_artifacts(self): + timestamps_by_links = {
'../': '2023-09-28', + 'foo.pom': '2023-09-28', + 'foo.ejb3': '2023-09-28', + 'foo.ear': '2023-09-28', + 'foo.aar': '2023-09-28', + 'foo.apk': '2023-09-28', + 'foo.gem': '2023-09-28', + 'foo.jar': '2023-09-28', + 'foo.nar': '2023-09-28', + 'foo.so': '2023-09-28', + 'foo.swc': '2023-09-28', + 'foo.tar': '2023-09-28', + 'foo.tar.gz': '2023-09-28', + 'foo.war': '2023-09-28', + 'foo.xar': '2023-09-28', + 'foo.zip': '2023-09-28', + } + expected = { + 'foo.ejb3': '2023-09-28', + 'foo.ear': '2023-09-28', + 'foo.aar': '2023-09-28', + 'foo.apk': '2023-09-28', + 'foo.gem': '2023-09-28', + 'foo.jar': '2023-09-28', + 'foo.nar': '2023-09-28', + 'foo.so': '2023-09-28', + 'foo.swc': '2023-09-28', + 'foo.tar': '2023-09-28', + 'foo.tar.gz': '2023-09-28', + 'foo.war': '2023-09-28', + 'foo.xar': '2023-09-28', + 'foo.zip': '2023-09-28', + } + self.assertEqual(expected, maven_visitor.filter_for_artifacts(timestamps_by_links)) + + def test_collect_links_from_text(self): + filter = maven_visitor.filter_only_directories + text = ''' + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + ''' + expected = { + '1.0.b2/': '2005-09-20 05:53', + '1.2.01/': '2010-02-03 21:05' + } + self.assertEqual( + expected, + maven_visitor.collect_links_from_text(text, filter=filter) + ) + + def test_create_absolute_urls_for_links(self): + filter = maven_visitor.filter_only_directories + text = ''' + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + ''' + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + expected = { + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' + } + self.assertEqual( + expected, + maven_visitor.create_absolute_urls_for_links(text, url, filter=filter) + ) + + @mock.patch('requests.get') + def test_get_directory_links(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = ''' + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + ''' + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' + expected = { + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' + } + self.assertEqual(expected, maven_visitor.get_directory_links(url)) + + @mock.patch('requests.get') + def test_get_artifact_links(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = ''' + ../ + xml-apis-1.0.b2.jar + 2005-09-20 05:53 109318 + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + ''' + url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/' + expected = { + 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53', + } + self.assertEqual(expected, maven_visitor.get_artifact_links(url)) + + def test_crawl_to_package(self): + pass + + def test_crawl_maven_repo_from_root(self): + pass + + @mock.patch('requests.get') + def test_get_artifact_sha1(self, mock_request_get): + sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9' + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = sha1 + self.assertEqual(sha1, maven_visitor.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) + + def test_get_classifier_from_artifact_url(self): + artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' + 
package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/' + package_name = 'livereload-jvm' + package_version = '0.2.0' + classifier = maven_visitor.get_classifier_from_artifact_url( + artifact_url, + package_version_page_url, + package_name, + package_version + ) + self.assertEqual('onejar', classifier) diff --git a/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json b/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json index 5dfd490a..8fdc7fae 100644 --- a/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json +++ b/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json @@ -9,7 +9,7 @@ "package_content":null, "primary_language":null, "description":"APIs that App Engine provides to you to build your application.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -53,7 +53,7 @@ "package_content":null, "primary_language":null, "description":null, - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -97,7 +97,7 @@ "package_content":null, "primary_language":null, "description":null, - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -141,7 +141,7 @@ "package_content":null, "primary_language":null, "description":"Library which allows discovering classes at runtime", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -185,7 +185,7 @@ "package_content":null, "primary_language":null, "description":"Library which allows discovering classes at runtime", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -229,7 +229,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -273,7 +273,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -317,7 +317,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -361,7 +361,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -405,7 +405,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -449,7 +449,7 @@ "package_content":null, "primary_language":null, "description":"Google Collections Library is a 
suite of new collections and collection-related goodness for Java 5.0", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -493,7 +493,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -537,7 +537,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -581,7 +581,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -625,7 +625,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -669,7 +669,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -713,7 +713,7 @@ "package_content":null, "primary_language":null, "description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -757,7 +757,7 @@ "package_content":null, "primary_language":null, "description":"The Social Graph Node Mapper is a community project to build a portable library to map social networking sites' URLs to and from a new canonical form (sgn:// URLs).", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, @@ -801,7 +801,7 @@ "package_content":null, "primary_language":null, "description":"The Social Graph Node Mapper is a community project to build a portable library to map social networking sites' URLs to and from a new canonical form (sgn:// URLs).", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, diff --git a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json index bc9d0ae4..a31a465f 100644 --- a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json +++ b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json @@ -9,7 +9,7 @@ "package_content":null, "primary_language":null, "description":"Common classes to make creating REST services more consistent.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, diff --git 
a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json index bc9d0ae4..a31a465f 100644 --- a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json +++ b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json @@ -9,7 +9,7 @@ "package_content":null, "primary_language":null, "description":"Common classes to make creating REST services more consistent.", - "release_date":"2009-05-21", + "release_date":"2009-05-21T00:00:00Z", "parties":[], "keywords":[], "homepage_url":null, diff --git a/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json b/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json index 3513944b..643c8892 100644 --- a/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json +++ b/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json @@ -9,7 +9,7 @@ "package_content":null, "primary_language":null, "description":"Get the vendored assets paths in gems.", - "release_date":"2012-08-03", + "release_date":"2012-08-03T00:00:00Z", "parties":[ { "type":null, diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py index 7fd70ac7..55624772 100644 --- a/minecode/visitors/maven.py +++ b/minecode/visitors/maven.py @@ -13,7 +13,9 @@ import io import json import logging +import re from typing import Dict +from urllib.parse import urlparse import arrow import requests @@ -305,7 +307,7 @@ def map_maven_package(package_url, package_content): ancestor_pom_texts=ancestor_pom_texts, package=package ) - + urls = get_urls( namespace=package_url.namespace, @@ -453,6 +455,344 @@ def process_request(purl_str): return error + +collect_links = re.compile(r'href="([^"]+)"').findall +collect_links_and_artifact_timestamps = re.compile( + r'<a href="([^"]+)">[^<]*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' +).findall + + +def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): + """ + Return True if `file_name` is in `links`. + """ + return any(l.endswith(file_name) for l in links) + + +def check_if_page_has_pom_files(links, **kwargs): + """ + Return True if any entry in `links` ends with .pom. + """ + return any(l.endswith('.pom') for l in links) + + +def check_if_page_has_directories(links, **kwargs): + """ + Return True if any entry, excluding "../", ends with /. + """ + return any(l.endswith('/') for l in links if l != '../') + + +def check_if_package_version_page(links, **kwargs): + """ + Return True if `links` contains pom files and has no directories. + """ + return ( + check_if_page_has_pom_files(links=links) + and not check_if_page_has_directories(links=links) + ) + + +def check_if_package_page(links, **kwargs): + return ( + check_if_file_name_is_linked_on_page(file_name='maven-metadata.xml', links=links) + and not check_if_page_has_pom_files(links=links) + ) + + +def check_if_maven_root(links, **kwargs): + """ + Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven + repo contains "archetype-catalog.xml". + """ + return check_if_file_name_is_linked_on_page(file_name='archetype-catalog.xml', links=links) + + +def check_on_page(url, checker): + """ + Return True if `checker` returns True for the links collected from the page + at `url`, False otherwise.
+ """ + response = requests.get(url) + if response: + links = collect_links(response.text) + return checker(links=links) + return False + + +def is_maven_root(url): + """ + Return True if `url` is the root of a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_maven_root) + + +def is_package_page(url): + """ + Return True if `url` is a package page on a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_package_page) + + +def is_package_version_page(url): + """ + Return True if `url` is a package version page on a Maven repo, False otherwise. + """ + return check_on_page(url, check_if_package_version_page) + + +def url_parts(url): + parsed_url = urlparse(url) + scheme = parsed_url.scheme + netloc = parsed_url.netloc + path_segments = [p for p in parsed_url.path.split('/') if p] + return scheme, netloc, path_segments + + +def create_url(scheme, netloc, path_segments): + url_template = f'{scheme}://{netloc}' + path = '/'.join(path_segments) + return f'{url_template}/{path}' + + +def get_maven_root(url): + """ + Given `url`, that is a URL to namespace, package, or artifact in a Maven + repo, return the URL to the root of that repo. If a Maven root cannot be + determined, return None. + + >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + 'https://repo1.maven.org/maven2' + """ + scheme, netloc, path_segments = url_parts(url) + for i in range(len(path_segments)): + segments = path_segments[:i+1] + url_segment = create_url(scheme, netloc, segments) + if is_maven_root(url_segment): + return url_segment + return None + + +def determine_namespace_name_version_from_url(url, root_url=None): + """ + Return a 3-tuple containing strings of a Package namespace, name, and + version, determined from `url`, where `url` points to namespace, package, + specific package version, or artifact on a Maven repo. + + Return None if a Maven root cannot be determined from `url`. + + >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') + ('net.shibboleth', 'parent', '7.11.0') + """ + if not root_url: + root_url = get_maven_root(url) + if not root_url: + raise Exception(f'Error: not a Maven repository: {url}') + + _, remaining_path_segments = url.split(root_url) + remaining_path_segments = remaining_path_segments.split('/') + remaining_path_segments = [p for p in remaining_path_segments if p] + + namespace_segments = [] + package_name = '' + package_version = '' + for i in range(len(remaining_path_segments)): + segment = remaining_path_segments[i] + segments = remaining_path_segments[:i+1] + path = '/'.join(segments) + url_segment = f'{root_url}/{path}' + if is_package_page(url_segment): + package_name = segment + elif is_package_version_page(url_segment): + package_version = segment + else: + namespace_segments.append(segment) + namespace = '.'.join(namespace_segments) + return namespace, package_name, package_version + + +def add_to_import_queue(url, root_url): + """ + Create ImportableURI for the Maven repo package page at `url`. 
+ """ + from minecode.models import ImportableURI + data = None + response = requests.get(url) + if response: + data = response.text + namespace, name, _ = determine_namespace_name_version_from_url(url, root_url) + purl = PackageURL( + type='maven', + namespace=namespace, + name=name, + ) + importable_uri = ImportableURI.objects.insert(url, data, purl) + if importable_uri: + logger.info(f'Inserted {url} into ImportableURI queue') + + +def filter_only_directories(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), + """ + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + if link != '../' and link.endswith('/'): + timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + +valid_artifact_extensions = [ + 'ejb3', + 'ear', + 'aar', + 'apk', + 'gem', + 'jar', + 'nar', + # 'pom', + 'so', + 'swc', + 'tar', + 'tar.gz', + 'war', + 'xar', + 'zip', +] + + +def filter_for_artifacts(timestamps_by_links): + """ + Given a mapping of `timestamps_by_links`, where the links are the filenames + of Maven artifacts, return a mapping of filenames whose extension is in + `valid_artifact_extensions` and their timestamps. + """ + timestamps_by_links_filtered = {} + for link, timestamp in timestamps_by_links.items(): + for ext in valid_artifact_extensions: + if link.endswith(ext): + timestamps_by_links_filtered[link] = timestamp + return timestamps_by_links_filtered + + +def collect_links_from_text(text, filter): + """ + Return a mapping of link locations and their timestamps, given HTML `text` + content, that is filtered using `filter`. + """ + links_and_timestamps = collect_links_and_artifact_timestamps(text) + timestamps_by_links = {} + for link, timestamp in links_and_timestamps: + if timestamp == '-': + timestamp = '' + timestamps_by_links[link] = timestamp + + timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) + return timestamps_by_links + + +def create_absolute_urls_for_links(text, url, filter): + """ + Given the `text` contents from `url`, return a mapping of absolute URLs to + links from `url` and their timestamps, that is then filtered by `filter`. 
+ """ + timestamps_by_absolute_links = {} + url = url.rstrip('/') + timestamps_by_links = collect_links_from_text(text, filter) + for link, timestamp in timestamps_by_links.items(): + if not link.startswith(url): + link = f'{url}/{link}' + timestamps_by_absolute_links[link] = timestamp + return timestamps_by_absolute_links + + +def get_directory_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_directory_links = {} + response = requests.get(url) + if response: + timestamps_by_directory_links = create_absolute_urls_for_links( + response.text, + url=url, + filter=filter_only_directories + ) + return timestamps_by_directory_links + + +def get_artifact_links(url): + """ + Return a list of absolute directory URLs of the hyperlinks from `url` + """ + timestamps_by_artifact_links = [] + response = requests.get(url) + if response: + timestamps_by_artifact_links = create_absolute_urls_for_links( + response.text, + url=url, + filter=filter_for_artifacts + ) + return timestamps_by_artifact_links + + +def crawl_to_package(url, root_url): + """ + Given a maven repo `url`, + """ + if is_package_page(url): + add_to_import_queue(url, root_url) + return + + for link in get_directory_links(url): + crawl_to_package(link, root_url) + + +def crawl_maven_repo_from_root(root_url): + """ + Given the `url` to a maven root, traverse the repo depth-first and add + packages to the import queue. + """ + crawl_to_package(root_url, root_url) + + +def get_artifact_sha1(artifact_url): + """ + Return the SHA1 value of the Maven artifact located at `artifact_url`. + """ + sha1 = None + artifact_sha1_url = f'{artifact_url}.sha1' + response = requests.get(artifact_sha1_url) + if response: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + return sha1 + + +def get_classifier_from_artifact_url(artifact_url, package_version_page_url, package_name, package_version): + """ + Return the classifier from a Maven artifact URL `artifact_url`, otherwise + return None if a classifier cannot be determined from `artifact_url` + """ + classifier = None + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 + package_version_page_url = package_version_page_url.rstrip('/') + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 + leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}' + # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' + # ['', '-onejar.jar'] + _, remaining_url_portion = artifact_url.split(leading_url_portion) + # ['-onejar', 'jar'] + remaining_url_portions = remaining_url_portion.split('.') + if remaining_url_portions and remaining_url_portions[0]: + # '-onejar' + classifier = remaining_url_portions[0] + if classifier.startswith('-'): + # 'onejar' + classifier = classifier[1:] + return classifier + + @visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') @visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor): diff --git a/packagedb/api.py b/packagedb/api.py index d8d7f59f..3dd17d7e 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -9,7 +9,9 @@ import logging from django.core.exceptions import ValidationError +from django.db.models import OuterRef from django.db.models import Q +from django.db.models import 
Subquery from django_filters.rest_framework import FilterSet from django_filters.filters import Filter from django_filters.filters import OrderingFilter @@ -565,12 +567,20 @@ def filter_by_checksums(self, request, *args, **kwargs): lookups = Q() for field, value in data.items(): + # Subquery to get the ids of the Packages with the earliest release_date for each `field` + earliest_release_dates = Package.objects.filter( + **{field: OuterRef(field)} + ).order_by('release_date').values('id')[:1] + value = value or [] - # We create this intermediate dictionary so we can modify the field - # name to have __in at the end - d = {f'{field}__in': value} - lookups |= Q(**d) + lookups |= Q( + **{ + f'{field}__in': value, + 'id__in': Subquery(earliest_release_dates), + } + ) + # Query to get the full Package objects with the earliest release_date for each checksum value qs = Package.objects.filter(lookups) paginated_qs = self.paginate_queryset(qs) if enhance_package_data: @@ -803,7 +813,7 @@ def get_all_versions(purl: PackageURL): except InvalidVersion: logger.warning(f"Invalid version '{package_version.value}' for '{purl}'") pass - + return result diff --git a/packagedb/migrations/0047_add_search_vector_field_to_package.py b/packagedb/migrations/0047_add_search_vector_field_to_package.py index c2687a27..9eccd785 100644 --- a/packagedb/migrations/0047_add_search_vector_field_to_package.py +++ b/packagedb/migrations/0047_add_search_vector_field_to_package.py @@ -1,6 +1,6 @@ # Generated by Django 3.1.5 on 2021-03-10 19:04 -import django.contrib.postgres.search +from django.contrib.postgres.search import SearchVector, SearchVectorField from django.db import migrations @@ -9,10 +9,26 @@ def populate_search_vector_field(apps, schema_editor): Data migration used to lowercase any purl field values that currently exist. """ Package = apps.get_model('packagedb', 'Package') - - for pkg in Package.objects.iterator(): - pkg.search_vector = search.SearchVector('namespace', 'name', 'version', 'download_url') - pkg.save() + resource_uris = Package.objects.iterator(chunk_size=5000) + updated = [] + for i, package in enumerate(resource_uris): + if not i % 5000: + Package.objects.bulk_update( + objs=updated, + fields=[ + 'search_vector', + ] + ) + updated = [] + package.search_vector = SearchVector('namespace', 'name', 'version', 'download_url') + updated.append(package) + if updated: + Package.objects.bulk_update( + objs=updated, + fields=[ + 'search_vector', + ] + ) @@ -25,7 +41,7 @@ class Migration(migrations.Migration): migrations.AddField( model_name='package', name='search_vector', - field=django.contrib.postgres.search.SearchVectorField(null=True), + field=SearchVectorField(null=True), ), migrations.RunPython(populate_search_vector_field), ] diff --git a/packagedb/migrations/0059_compute_package_license_data.py b/packagedb/migrations/0059_compute_package_license_data.py index c57d14e9..109c4254 100644 --- a/packagedb/migrations/0059_compute_package_license_data.py +++ b/packagedb/migrations/0059_compute_package_license_data.py @@ -9,18 +9,51 @@ def compute_package_declared_license_expression_spdx(apps, schema_editor): Compute Package `declared_license_expression_spdx`, when missing, from `declared_license_expression`, when available.
""" - from licensedcode.cache import build_spdx_license_expression + from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError + from packageurl import PackageURL Package = apps.get_model('packagedb', 'Package') packages = Package.objects.filter( ~Q(declared_license_expression="") & Q(declared_license_expression_spdx="") | Q(declared_license_expression__isnull=False) & Q(declared_license_expression_spdx__isnull=True) ) + package_count = packages.count() + chunk_size = 2000 + iterator = packages.iterator(chunk_size=chunk_size) + updated = [] + for i, package in enumerate(iterator): + if (not i % chunk_size) and updated: + Package.objects.bulk_update( + objs=updated, + fields=[ + 'declared_license_expression_spdx', + ] + ) + updated = [] + print(f" {i:,} / {package_count:,} computed and updated") + try: + if spdx := build_spdx_license_expression(package.declared_license_expression): + package.declared_license_expression_spdx = spdx + updated.append(package) + except InvalidLicenseKeyError as e: + package_url = PackageURL( + type=package.type, + namespace=package.namespace, + name=package.name, + version=package.version, + qualifiers=package.qualifiers, + subpath=package.subpath + ) + print(f" Error processing {package_url}: {e}") - for package in packages: - if spdx := build_spdx_license_expression(package.declared_license_expression): - package.declared_license_expression_spdx = spdx - package.save() + if updated: + print("Updating remaining Packages...") + Package.objects.bulk_update( + objs=updated, + fields=[ + 'declared_license_expression_spdx', + ] + ) class Migration(migrations.Migration): diff --git a/packagedb/migrations/0062_compute_resource_license_data.py b/packagedb/migrations/0062_compute_resource_license_data.py index 0cef20fc..f1b996c8 100644 --- a/packagedb/migrations/0062_compute_resource_license_data.py +++ b/packagedb/migrations/0062_compute_resource_license_data.py @@ -13,13 +13,13 @@ def compute_resource_detected_license_expression(apps, schema_editor): From scancode.io """ from license_expression import combine_expressions - from licensedcode.cache import build_spdx_license_expression + from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError if settings.IS_TESTS: return Resource = apps.get_model("packagedb", "Resource") - resources = Resource.objects.filter(~Q(license_expressions=[]) | Q(license_expressions__isnull=False)).only('license_expressions') + resources = Resource.objects.filter(~Q(license_expressions=[])).filter(license_expressions__is_null=False) object_count = resources.count() print(f"\nCompute detected_license_expression for {object_count:,} resources.") @@ -29,7 +29,11 @@ def compute_resource_detected_license_expression(apps, schema_editor): unsaved_objects = [] for index, resource in enumerate(iterator, start=1): - combined_expression = str(combine_expressions(resource.license_expressions)) + combined_expression = combine_expressions(resource.license_expressions) + if not combined_expression: + print(f' invalid license expression for {resource.path}: {combined_expression}') + continue + combined_expression = str(combined_expression) # gpl-2.0 OR broadcom-linking-unmodified OR proprietary-license # build_spdx_license_expression("broadcom-linking-unmodified") # AttributeError: 'LicenseSymbol' object has no attribute 'wrapped' @@ -122,7 +126,7 @@ def compute_resource_license_detections(apps, schema_editor): From scancode.io """ Resource = apps.get_model("packagedb", "Resource") - resources = 
Resource.objects.filter(~Q(licenses=[]) | Q(licenses__isnull=False)).only('licenses') + resources = Resource.objects.filter(~Q(licenses=[])).filter(licenses__isnull=False) object_count = resources.count() print(f"\nCompute license_detections for {object_count:,} resources.") diff --git a/packagedb/migrations/0070_auto_20230706_0045.py b/packagedb/migrations/0070_auto_20230706_0045.py index 9d18cbdd..d9fa116a 100644 --- a/packagedb/migrations/0070_auto_20230706_0045.py +++ b/packagedb/migrations/0070_auto_20230706_0045.py @@ -66,8 +66,11 @@ def create_maven_package_sets(apps, schema_editor): "version", "qualifiers", "subpath", - ).iterator( - chunk_size=5000 + ) + package_count = maven_packages_without_package_set.count() + chunk_size = 2000 + iterator = maven_packages_without_package_set.iterator( + chunk_size=chunk_size ) prev_namespace = None @@ -75,7 +78,17 @@ prev_version = None prev_package = None unupdated_packages = [] - for package in maven_packages_without_package_set: + for i, package in enumerate(iterator): + if not (i % chunk_size) and unupdated_packages: + Package.objects.bulk_update( + objs=unupdated_packages, + fields=[ + "package_content", + ] + ) + unupdated_packages = [] + print(f" {i:,} / {package_count:,} updated") + if "source" in package.qualifiers: package_content = PackageContentType.SOURCE_ARCHIVE else: diff --git a/packagedb/migrations/0078_alter_package_release_date.py b/packagedb/migrations/0078_alter_package_release_date.py new file mode 100644 index 00000000..b33739fa --- /dev/null +++ b/packagedb/migrations/0078_alter_package_release_date.py @@ -0,0 +1,22 @@ +# Generated by Django 4.1.2 on 2023-09-29 21:24 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("packagedb", "0077_remove_package_declared_license_expression_spdx_and_more"), + ] + + operations = [ + migrations.AlterField( + model_name="package", + name="release_date", + field=models.DateTimeField( + blank=True, + db_index=True, + help_text="The date and time that the package file was created, or when it was posted to its original download source.", + null=True, + ), + ), + ] diff --git a/packagedb/models.py b/packagedb/models.py index e4d7c58a..23588273 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -254,12 +254,12 @@ class AbstractPackage(models.Model): "By convention the first line should be a summary when available." ), ) - release_date = models.DateField( + release_date = models.DateTimeField( blank=True, null=True, db_index=True, help_text=_( - "The date and time that the package file was created, or when " + "The date and time that the package file was created, or when " "it was posted to its original download source." ), )
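Note on the similarity_score added in matchcode/api.py: the score maps the byte-wise Hamming distance between two 128-bit halohash chunks onto a 0.0-1.0 scale. The following minimal sketch is illustrative and not part of the diff; hamming() is a stand-in for matchcode_toolkit.halohash.byte_hamming_distance, assumed to count differing bits between two equal-length byte strings.

def hamming(a, b):
    # Stand-in for byte_hamming_distance: count the bits that differ
    # between two equal-length byte strings.
    assert len(a) == len(b)
    return sum(bin(x ^ y).count('1') for x, y in zip(a, b))

def similarity_score(bah128, match_bah128):
    # 128 bits total; identical fingerprints score 1.0.
    hd = hamming(bah128, match_bah128)
    return (128 - hd) / 128

# The "close match" tests use fingerprints at Hamming distance 7 from the
# indexed fingerprint, hence the expected score: (128 - 7) / 128 == 0.9453125.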
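Note on the ImportableURI queue: get_next_request() relies on select_for_update(skip_locked=True) so that concurrent workers claim different rows without blocking each other, which is why the import_queue command wraps the call in transaction.atomic. A condensed sketch of that pattern, using only names this diff introduces:

from django.db import transaction
from django.utils import timezone

from minecode.models import ImportableURI

def claim_next_importable_uri():
    # Must run inside transaction.atomic: the row lock taken by
    # select_for_update is held until the transaction ends.
    with transaction.atomic():
        importable_uri = (
            ImportableURI.objects
            .never_processed()
            .select_for_update(skip_locked=True)
            .first()
        )
        if importable_uri:
            # Mark as work-in-progress so other workers skip this row.
            importable_uri.wip_date = timezone.now()
            importable_uri.save(update_fields=['wip_date'])
        return importable_uri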
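Note on the filter_by_checksums change in packagedb/api.py: the correlated subquery keeps, for each checksum value, only the Package with the earliest release_date. A condensed restatement of the pattern for a single checksum field (illustrative only; assumes a Package model exposing that field and release_date):

from django.db.models import OuterRef, Subquery

def packages_with_earliest_release_date(Package, field, values):
    # For each candidate row, select the id of the oldest Package
    # sharing the same checksum value.
    earliest = (
        Package.objects
        .filter(**{field: OuterRef(field)})
        .order_by('release_date')
        .values('id')[:1]
    )
    return Package.objects.filter(
        **{f'{field}__in': values, 'id__in': Subquery(earliest)}
    )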