diff --git a/matchcode/api.py b/matchcode/api.py
index 092933e3..68844b8d 100644
--- a/matchcode/api.py
+++ b/matchcode/api.py
@@ -15,6 +15,7 @@
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.serializers import CharField
+from rest_framework.serializers import FloatField
from rest_framework.serializers import HyperlinkedRelatedField
from rest_framework.serializers import ModelSerializer
from rest_framework.serializers import ReadOnlyField
@@ -24,6 +25,7 @@
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode_toolkit.fingerprinting import split_fingerprint
+from matchcode_toolkit.halohash import byte_hamming_distance
from matchcode.models import ExactFileIndex
from matchcode.models import ExactPackageArchiveIndex
from matchcode.models import ApproximateDirectoryContentIndex
@@ -91,6 +93,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer):
lookup_field='uuid',
read_only=True
)
+ similarity_score = FloatField()
class CharMultipleWidget(widgets.TextInput):
@@ -271,11 +274,18 @@ def match(self, request):
for fingerprint in unique_fingerprints:
matches = model_class.match(fingerprint)
for match in matches:
+ _, bah128 = split_fingerprint(fingerprint)
+ # Get fingerprint from the match
+ fp = match.fingerprint()
+ _, match_bah128 = split_fingerprint(fp)
+ hd = byte_hamming_distance(bah128, match_bah128)
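+ # A bah128 fingerprint is 128 bits: scale the hamming distance to a
+ # similarity score between 0.0 (all bits differ) and 1.0 (identical)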
+ similarity_score = (128 - hd) / 128
results.append(
{
'fingerprint': fingerprint,
- 'matched_fingerprint': match.fingerprint(),
+ 'matched_fingerprint': fp,
'package': match.package,
+ 'similarity_score': similarity_score,
}
)
diff --git a/matchcode/tests/test_api.py b/matchcode/tests/test_api.py
index be971081..8decc568 100644
--- a/matchcode/tests/test_api.py
+++ b/matchcode/tests/test_api.py
@@ -117,6 +117,7 @@ def test_api_approximate_directory_content_index_match_close_match(self):
self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
self.assertEqual(expected_package, result['package'])
+ self.assertEqual(0.9453125, result['similarity_score'])
def test_api_approximate_directory_structure_index_match_close_match(self):
# This test fingerprint has a hamming distance of 7 from the expected fingerprint
@@ -133,6 +134,7 @@ def test_api_approximate_directory_structure_index_match_close_match(self):
self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
self.assertEqual(expected_package, result['package'])
+ self.assertEqual(0.9453125, result['similarity_score'])
def test_api_approximate_directory_content_index_match(self):
test_fingerprint = '00000007af7d63765c78fa516b5353f5ffa7df45'
@@ -147,6 +149,7 @@ def test_api_approximate_directory_content_index_match(self):
self.assertEqual(test_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
self.assertEqual(expected_package, result['package'])
+ self.assertEqual(1.0, result['similarity_score'])
def test_api_approximate_directory_structure_index_match(self):
test_fingerprint = '00000004d10982208810240820080a6a3e852486'
@@ -161,3 +164,4 @@ def test_api_approximate_directory_structure_index_match(self):
self.assertEqual(test_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
self.assertEqual(expected_package, result['package'])
+ self.assertEqual(1.0, result['similarity_score'])
diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py
new file mode 100644
index 00000000..c120b67e
--- /dev/null
+++ b/minecode/management/commands/get_maven_release_dates.py
@@ -0,0 +1,75 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from dateutil.parser import parse as dateutil_parse
+from os.path import dirname
+import logging
+import sys
+
+import requests
+
+from minecode.management.commands import VerboseCommand
+from minecode.visitors.maven import collect_links_from_text
+from minecode.visitors.maven import filter_for_artifacts
+from packagedb.models import Package
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout)
+logger.setLevel(logging.INFO)
+
+TRACE = False
+if TRACE:
+ logger.setLevel(logging.DEBUG)
+
+
+class Command(VerboseCommand):
+ help = 'Get and set release_date for Maven Packages'
+
+ def handle(self, *args, **options):
+ queryset = Package.objects.filter(
+ type='maven',
+ release_date=None,
+ download_url__startswith='https://repo1.maven.org/maven2'
+ )
+ object_count = queryset.count()
+ chunk_size = 2000
+ iterator = queryset.iterator(chunk_size=chunk_size)
+ unsaved_objects = []
+
+ logger.info(f'Updating release_date for {object_count} packages')
+ for index, package in enumerate(iterator, start=1):
+ download_url = package.download_url
+ package_url = package.package_url
+ logger.info(f'Updating release_date for package {package_url} ({download_url})')
+ package_version_page_url = dirname(download_url)
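+ # the parent directory of the download URL is the package version page,
+ # which lists each artifact link with its timestamp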
+ filename = download_url.rsplit('/')[-1]
+ response = requests.get(package_version_page_url)
+ if response:
+ timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts)
+ timestamp = timestamps_by_links.get(filename)
+ if not timestamp:
+ logger.info(f'\tCould not get release_date for package {package_url} ({download_url})')
+ continue
+ timestamp = dateutil_parse(timestamp)
+ package.release_date = timestamp
+ unsaved_objects.append(package)
+ logger.info(f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}')
+ else:
+ logger.info(f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}')
+ if not (index % chunk_size) and unsaved_objects:
+ # flush the accumulated updates so memory use stays bounded
+ Package.objects.bulk_update(
+ objs=unsaved_objects,
+ fields=['release_date'],
+ batch_size=1000,
+ )
+ unsaved_objects = []
+ logger.info(f'{index:,} / {object_count:,} Packages processed')
+
+ logger.info('Updating remaining Package objects...')
+ Package.objects.bulk_update(
+ objs=unsaved_objects,
+ fields=['release_date'],
+ batch_size=1000,
+ )
+ logger.info(f'Updated release_date for {object_count:,} packages')
diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py
new file mode 100644
index 00000000..55862921
--- /dev/null
+++ b/minecode/management/commands/import_queue.py
@@ -0,0 +1,167 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from dateutil.parser import parse as dateutil_parse
+import logging
+import signal
+import sys
+import time
+
+import requests
+
+from django.db import transaction
+from django.utils import timezone
+from packageurl import PackageURL
+
+from minecode.management.commands import get_error_message
+from minecode.management.commands import VerboseCommand
+from minecode.models import ImportableURI
+from minecode.visitors.maven import get_artifact_links
+from minecode.visitors.maven import get_classifier_from_artifact_url
+from minecode.visitors.maven import collect_links_from_text
+from minecode.visitors.maven import filter_only_directories
+from minecode.visitors.maven import get_artifact_sha1
+from minecode.model_utils import merge_or_create_package
+from packagedcode.models import PackageData
+from packagedb.models import Package
+from minecode.visitors.maven import determine_namespace_name_version_from_url
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout)
+logger.setLevel(logging.INFO)
+
+TRACE = False
+if TRACE:
+ logger.setLevel(logging.DEBUG)
+
+# sleep duration in seconds when the queue is empty
+SLEEP_WHEN_EMPTY = 10
+
+MUST_STOP = False
+
+
+def stop_handler(*args, **kwargs):
+ """
+ Signal handler to set global variable to True.
+ """
+ global MUST_STOP
+ MUST_STOP = True
+
+
+signal.signal(signal.SIGTERM, stop_handler)
+
+
+class Command(VerboseCommand):
+ help = 'Process the ImportableURI queue.'
+
+ def handle(self, *args, **options):
+ """
+ Get the next processable ImportableURI and start the
+ processing. Loops forever and sleeps a short while if there are
+ no ImportableURIs left to process.
+ """
+
+ global MUST_STOP
+
+ sleeping = False
+ processed_counter = 0
+
+ while True:
+ if MUST_STOP:
+ logger.info('Graceful exit of the request queue.')
+ break
+
+ with transaction.atomic():
+ importable_uri = ImportableURI.objects.get_next_request()
+
+ if not importable_uri:
+ # Only log a single message when we go to sleep
+ if not sleeping:
+ sleeping = True
+ logger.info('No more processable request, sleeping...')
+
+ time.sleep(SLEEP_WHEN_EMPTY)
+ continue
+
+ sleeping = False
+
+ # process request
+ logger.info('Processing {}'.format(importable_uri))
+ try:
+ errors = process_request(importable_uri)
+ except Exception as e:
+ errors = 'Error: Failed to process ImportableURI: {}\n'.format(
+ repr(importable_uri))
+ errors += get_error_message(e)
+ finally:
+ if errors:
+ importable_uri.processing_error = errors
+ logger.error(errors)
+ importable_uri.processed_date = timezone.now()
+ importable_uri.wip_date = None
+ importable_uri.save()
+ processed_counter += 1
+
+ return processed_counter
+
+
+def process_request(importable_uri):
+ uri = importable_uri.uri
+ uri = uri.rstrip('/')
+ data = importable_uri.data
+ if not data:
+ # collect data again if we don't have it
+ response = requests.get(uri)
+ if response:
+ data = response.text
+
+ purl = importable_uri.package_url
+ if purl:
+ package_url = PackageURL.from_string(purl)
+ namespace = package_url.namespace
+ name = package_url.name
+ else:
+ namespace, name, _ = determine_namespace_name_version_from_url(uri)
+
+ timestamps_by_directory_links = collect_links_from_text(data, filter_only_directories)
+ # Go into each version directory
+ for directory_link in timestamps_by_directory_links.keys():
+ version = directory_link.rstrip('/')
+ version_page_url = f'{uri}/{version}'
+ timestamps_by_artifact_links = get_artifact_links(version_page_url)
+ for artifact_link, timestamp in timestamps_by_artifact_links.items():
+ sha1 = get_artifact_sha1(artifact_link)
+ classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version)
+ qualifiers = None
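+ # a classifier is carried as a purl qualifier, e.g. classifier=onejar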
+ if classifier:
+ qualifiers = f'classifier={classifier}'
+ release_date = dateutil_parse(timestamp)
+ package_data = PackageData(
+ type='maven',
+ namespace=namespace,
+ name=name,
+ version=version,
+ qualifiers=qualifiers,
+ download_url=artifact_link,
+ sha1=sha1,
+ release_date=release_date,
+ )
+ package, created, merged, map_error = merge_or_create_package(
+ scanned_package=package_data,
+ visit_level=50
+ )
+ if created:
+ logger.info(f'Created package {package}')
+ if merged:
+ logger.info(f'Updated package {package}')
+ if map_error:
+ logger.error(f'Error encountered: {map_error}')
+ importable_uri.processing_error = map_error
+ importable_uri.save()
diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py
new file mode 100644
index 00000000..30c8f360
--- /dev/null
+++ b/minecode/management/commands/maven_crawler.py
@@ -0,0 +1,31 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import logging
+import sys
+
+from minecode.management.commands import VerboseCommand
+from minecode.visitors.maven import crawl_maven_repo_from_root
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout)
+logger.setLevel(logging.INFO)
+
+TRACE = False
+if TRACE:
+ logger.setLevel(logging.DEBUG)
+
+
+class Command(VerboseCommand):
+ help = 'Crawl a Maven repository and add package pages to the import queue.'
+
+ def handle(self, *args, **options):
+ maven_root_url = 'https://repo.maven.apache.org/maven2'
+ crawl_maven_repo_from_root(root_url=maven_root_url)
diff --git a/minecode/migrations/0031_importableuri.py b/minecode/migrations/0031_importableuri.py
new file mode 100644
index 00000000..0d557312
--- /dev/null
+++ b/minecode/migrations/0031_importableuri.py
@@ -0,0 +1,181 @@
+# Generated by Django 4.1.2 on 2023-09-12 00:14
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("minecode", "0030_scannableuri_rescan_alter_scannableuri_scan_status"),
+ ]
+
+ operations = [
+ migrations.CreateModel(
+ name="ImportableURI",
+ fields=[
+ (
+ "id",
+ models.AutoField(
+ auto_created=True,
+ primary_key=True,
+ serialize=False,
+ verbose_name="ID",
+ ),
+ ),
+ (
+ "uri",
+ models.CharField(
+ db_index=True,
+ help_text="URI for this resource. This is the unmodified original URI.",
+ max_length=2048,
+ ),
+ ),
+ (
+ "canonical",
+ models.CharField(
+ db_index=True,
+ help_text="Canonical form of the URI for this resource that must be unique across all ResourceURI.",
+ max_length=3000,
+ ),
+ ),
+ (
+ "source_uri",
+ models.CharField(
+ blank=True,
+ help_text="Optional: real source remote URI for this visit.For example for a package repository index is a typical source via which a first level of package data is fetched. And it is not the URI in the uri field. It is just the source of the fetchOr the source may be a mirror URI used for fetching.",
+ max_length=2048,
+ null=True,
+ ),
+ ),
+ (
+ "priority",
+ models.PositiveIntegerField(
+ db_index=True,
+ default=0,
+ help_text="Absolute procdssing priority of a URI (default to zero), higher number means higher priority, zero means lowest priority.",
+ ),
+ ),
+ (
+ "wip_date",
+ models.DateTimeField(
+ blank=True,
+ db_index=True,
+ help_text="Work In Progress. This is a timestamp set at the start of a visit or mapping or indexing or null when no processing is in progress.",
+ null=True,
+ ),
+ ),
+ (
+ "file_name",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ help_text="File name of a resource sometimes part of the URI proper and sometimes only available through an HTTP header.",
+ max_length=255,
+ null=True,
+ ),
+ ),
+ (
+ "size",
+ models.PositiveIntegerField(
+ blank=True,
+ db_index=True,
+ help_text="Size in bytes of the file represented by this ResourceURI.",
+ null=True,
+ ),
+ ),
+ (
+ "sha1",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ help_text="SHA1 checksum hex-encoded (as in the sha1sum command) of the content of the file represented by this ResourceURI.",
+ max_length=40,
+ null=True,
+ ),
+ ),
+ (
+ "md5",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ help_text="MD5 checksum hex-encoded (as in the md5sum command) of the content of the file represented by this ResourceURI.",
+ max_length=32,
+ null=True,
+ ),
+ ),
+ (
+ "sha256",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ help_text="SHA256 checksum hex-encoded (as in the sha256sum command) of the content of the file represented by this ResourceURI.",
+ max_length=64,
+ null=True,
+ ),
+ ),
+ (
+ "last_modified_date",
+ models.DateTimeField(
+ blank=True,
+ db_index=True,
+ help_text="Timestamp set to the last modified date of the remote resource represented by this URI such as the modified date of a file, the lastmod value on a sitemap or the modified date returned by an HTTP resource.",
+ null=True,
+ ),
+ ),
+ (
+ "package_url",
+ models.CharField(
+ blank=True,
+ db_index=True,
+ help_text='Package URL for this resource. It stands for a package "mostly universal" URL.',
+ max_length=2048,
+ null=True,
+ ),
+ ),
+ (
+ "data",
+ models.TextField(
+ blank=True,
+ help_text="Text content of the file represented by this ResourceURI. This contains the data that was fetched or extracted from a remote ResourceURI such as HTML or JSON.",
+ null=True,
+ ),
+ ),
+ (
+ "request_date",
+ models.DateTimeField(
+ blank=True,
+ db_index=True,
+ help_text="Timestamp set to the date of when this Package info was requested.",
+ null=True,
+ ),
+ ),
+ (
+ "processed_date",
+ models.DateTimeField(
+ blank=True,
+ db_index=True,
+ help_text="Timestamp set to the date of when this Package info was processed.",
+ null=True,
+ ),
+ ),
+ (
+ "has_processing_error",
+ models.BooleanField(
+ db_index=True,
+ default=False,
+ help_text="When set to True (Yes), this field indicates that an error has occured when processing this URI.",
+ ),
+ ),
+ (
+ "processing_error",
+ models.TextField(
+ blank=True,
+ help_text="Processing errors messages. When present this means the processing failed.",
+ null=True,
+ ),
+ ),
+ ],
+ options={
+ "verbose_name": "Importable URI",
+ },
+ ),
+ ]
diff --git a/minecode/models.py b/minecode/models.py
index 16c44854..3a6f046e 100644
--- a/minecode/models.py
+++ b/minecode/models.py
@@ -31,10 +31,6 @@
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.INFO)
-# logger = logging.getLogger(__name__)
-# handler = logging.StreamHandler()
-# logger.addHandler(handler)
-
def get_canonical(uri):
"""
@@ -936,3 +932,134 @@ def save(self, *args, **kwargs):
"""
self.normalize_fields()
super(PriorityResourceURI, self).save(*args, **kwargs)
+
+
+# TODO: Use the QuerySet.as_manager() for more flexibility and chaining.
+class ImportableURIManager(models.Manager):
+ def insert(self, uri, data, package_url, **extra_fields):
+ """
+ Create and return a new ImportableURI.
+ Return None if a request for the same URI already exists and has not been processed yet.
+ """
+ # TODO: be able to create a request for an existing purl if the previous request has been completed already
+
+ importable_uris = self.filter(
+ uri=uri,
+ **extra_fields
+ )
+ if (
+ importable_uris.count() == 0
+ or all(p.processed_date for p in importable_uris)
+ ):
+ importable_uri = self.create(
+ uri=uri,
+ data=data,
+ package_url=package_url,
+ **extra_fields
+ )
+ return importable_uri
+
+ def in_progress(self):
+ """
+ Limit the QuerySet to ImportableURI being processed.
+ """
+ return self.filter(wip_date__isnull=False)
+
+ def never_processed(self):
+ """
+ Limit the QuerySet to ImportableURIs that have never been processed.
+ This is usually the state of an ImportableURI upon creation.
+ """
+ return self.filter(
+ processed_date__isnull=True,
+ wip_date__isnull=True
+ ).order_by(
+ 'request_date'
+ )
+
+ def get_requests(self):
+ """
+ Return an ordered query set of all processable ImportableURIs.
+ """
+ never_processed = self.never_processed()
+ return never_processed
+
+ def get_next_request(self):
+ """
+ Return the next ImportableURI request for processing and mark it
+ as being "in_progress" by setting the wip_date field.
+
+ Return None when there is no request left to visit.
+
+ NOTE: this method can only be called from within a transaction.atomic
+ block.
+ """
+ importable_uri = self.get_requests().select_for_update(skip_locked=True).first()
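+ # skip_locked lets multiple workers pull requests concurrently without
+ # blocking on one another's row locks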
+ if not importable_uri:
+ return
+ importable_uri.wip_date = timezone.now()
+ importable_uri.save(update_fields=['wip_date'])
+ return importable_uri
+
+
+# TODO: have a second queue for crawling the maven repo that tracks which pages and namespaces we have visited.
+# When we hit a package page, we add it to the queue that creates skinny packages for the packages we visited.
+
+class ImportableURI(BaseURI):
+ package_url = models.CharField(
+ max_length=2048,
+ null=True,
+ blank=True,
+ db_index=True,
+ help_text="""Package URL for this resource. It stands for a package "mostly universal" URL."""
+ )
+
+ # This is a text blob that contains either HTML, JSON or anything
+ # stored as a string. This is the raw content of visiting a URI.
+ # NOTE: some visited URLs (such as an actual package archive) will/should NOT be stored there
+ data = models.TextField(
+ null=True,
+ blank=True,
+ help_text='Text content of the file represented by this '
+ 'ResourceURI. This contains the data that was fetched or '
+ 'extracted from a remote ResourceURI such as HTML or JSON.',
+ )
+
+ request_date = models.DateTimeField(
+ null=True,
+ blank=True,
+ db_index=True,
+ help_text='Timestamp set to the date of when this Package info was requested.',
+ )
+
+ processed_date = models.DateTimeField(
+ null=True,
+ blank=True,
+ db_index=True,
+ help_text='Timestamp set to the date of when this Package info was processed.',
+ )
+
+ has_processing_error = models.BooleanField(
+ db_index=True,
+ default=False,
+ help_text='When set to True (Yes), this field indicates that '
+ 'an error has occurred when processing this URI.'
+ )
+
+ processing_error = models.TextField(
+ null=True,
+ blank=True,
+ help_text='Processing errors messages. When present this means the processing failed.',
+ )
+
+ objects = ImportableURIManager()
+
+ class Meta:
+ verbose_name = 'Importable URI'
+
+ def save(self, *args, **kwargs):
+ """
+ Save, adding defaults for computed fields and validating fields.
+ """
+ self.normalize_fields()
+ super(ImportableURI, self).save(*args, **kwargs)
diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py
index 8584f49d..2375f20b 100644
--- a/minecode/tests/test_maven.py
+++ b/minecode/tests/test_maven.py
@@ -868,3 +868,315 @@ def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock,
merged_package = maven_visitor.get_merged_ancestor_package_from_maven_package(package=db_package)
expected_loc = self.get_test_loc('maven/pom/pulsar-client-merged-ancestor-package.json')
self.check_expected_results(merged_package.to_dict(), expected_loc, regen=regen)
+
+
+class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase):
+ test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
+
+ def test_check_if_file_name_is_linked_on_page(self):
+ links = ['foo/', 'bar/', 'baz/']
+ self.assertTrue(
+ maven_visitor.check_if_file_name_is_linked_on_page('foo/', links)
+ )
+ self.assertFalse(
+ maven_visitor.check_if_file_name_is_linked_on_page('qux/', links)
+ )
+
+ def test_check_if_page_has_pom_files(self):
+ links1 = ['foo/', 'bar.jar', 'bar.pom']
+ links2 = ['foo/', 'bar.jar']
+ self.assertTrue(maven_visitor.check_if_page_has_pom_files(links1))
+ self.assertFalse(maven_visitor.check_if_page_has_pom_files(links2))
+
+ def test_check_if_page_has_directories(self):
+ links1 = ['foo/', 'bar/', 'baz/']
+ links2 = ['../', 'bar.pom', 'bar.jar']
+ self.assertTrue(maven_visitor.check_if_page_has_directories(links1))
+ self.assertFalse(maven_visitor.check_if_page_has_directories(links2))
+
+ def test_check_if_package_version_page(self):
+ links1 = ['../', 'bar.pom', 'bar.jar']
+ links2 = ['../', 'foo/', 'bar/', 'baz/']
+ self.assertTrue(maven_visitor.check_if_package_version_page(links1))
+ self.assertFalse(maven_visitor.check_if_package_version_page(links2))
+
+ def test_check_if_package_page(self):
+ links1 = ['../', 'maven-metadata.xml']
+ links2 = ['../', 'bar.pom', 'bar.jar']
+ self.assertTrue(maven_visitor.check_if_package_page(links1))
+ self.assertFalse(maven_visitor.check_if_package_page(links2))
+
+ def test_check_if_maven_root(self):
+ links1 = ['../', 'archetype-catalog.xml']
+ links2 = ['../', 'bar.pom', 'bar.jar']
+ self.assertTrue(maven_visitor.check_if_maven_root(links1))
+ self.assertFalse(maven_visitor.check_if_maven_root(links2))
+
+ @mock.patch('requests.get')
+ def test_check_on_page(self, mock_request_get):
+ checker = maven_visitor.check_if_page_has_pom_files
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = '<a href="parent-7.11.0.pom" title="parent-7.11.0.pom">parent-7.11.0.pom</a>'
+ self.assertTrue(maven_visitor.check_on_page('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker))
+
+ @mock.patch('requests.get')
+ def test_is_maven_root(self, mock_request_get):
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = '<a href="archetype-catalog.xml" title="archetype-catalog.xml">archetype-catalog.xml</a>'
+ self.assertTrue(maven_visitor.is_maven_root('https://repo1.maven.org/maven2/'))
+
+ @mock.patch('requests.get')
+ def test_is_package_page(self, mock_request_get):
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = '<a href="maven-metadata.xml" title="maven-metadata.xml">maven-metadata.xml</a>'
+ self.assertTrue(maven_visitor.is_package_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/'))
+
+ @mock.patch('requests.get')
+ def test_is_package_version_page(self, mock_request_get):
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = '''
+ <a href="../">../</a>
+ <a href="parent-7.11.0.pom" title="parent-7.11.0.pom">parent-7.11.0.pom</a>
+ '''
+ self.assertTrue(maven_visitor.is_package_version_page('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/'))
+
+ def test_url_parts(self):
+ url = 'https://example.com/foo/bar/baz.jar'
+ scheme, netloc, path_segments = maven_visitor.url_parts(url)
+ self.assertEqual('https', scheme)
+ self.assertEqual('example.com', netloc)
+ self.assertEqual(['foo', 'bar', 'baz.jar'], path_segments)
+
+ def test_create_url(self):
+ scheme = 'https'
+ netloc = 'example.com'
+ path_segments = ['foo', 'bar', 'baz.jar']
+ url = 'https://example.com/foo/bar/baz.jar'
+ self.assertEqual(
+ url,
+ maven_visitor.create_url(scheme, netloc, path_segments)
+ )
+
+ @mock.patch('requests.get')
+ def test_get_maven_root(self, mock_request_get):
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = 'archetype-catalog.xml'
+ self.assertEqual(
+ 'https://repo1.maven.org/maven2',
+ maven_visitor.get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+ )
+
+ @mock.patch('requests.get')
+ def test_determine_namespace_name_version_from_url(self, mock_request_get):
+ url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2'
+ root_url = 'https://repo1.maven.org/maven2'
+
+ package_page_text = '''
+ <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>
+ 2005-09-20 05:53 -
+ <a href="maven-metadata.xml" title="maven-metadata.xml">maven-metadata.xml</a>
+ 2012-06-26 17:01 567
+ '''
+ package_page = mock.Mock(ok=True, text=package_page_text)
+
+ package_version_page_text = '''
+ <a href="../">../</a> -
+ <a href="xml-apis-1.0.b2.pom" title="xml-apis-1.0.b2.pom">xml-apis-1.0.b2.pom</a>
+ 2005-09-20 05:53 2249
+ '''
+ package_version_page = mock.Mock(ok=True, text=package_version_page_text)
+ mock_request_get.side_effect = [
+ mock.Mock(ok=True, text=''),
+ mock.Mock(ok=True, text=''),
+ package_page,
+ mock.Mock(ok=True, text=''),
+ package_version_page
+ ]
+
+ namespace, package_name, package_version = maven_visitor.determine_namespace_name_version_from_url(url, root_url)
+ self.assertEqual('xml-apis', namespace)
+ self.assertEqual('xml-apis', package_name)
+ self.assertEqual('1.0.b2', package_version)
+
+ @mock.patch('requests.get')
+ def test_add_to_import_queue(self, mock_request_get):
+ from minecode.models import ImportableURI
+
+ url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/'
+ root_url = 'https://repo1.maven.org/maven2'
+
+ package_page_text = '''
+ <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>
+ 2005-09-20 05:53 -
+ <a href="maven-metadata.xml" title="maven-metadata.xml">maven-metadata.xml</a>
+ 2012-06-26 17:01 567
+ '''
+ package_page = mock.Mock(ok=True, text=package_page_text)
+
+ package_version_page_text = '''
+ <a href="../">../</a> -
+ <a href="xml-apis-1.0.b2.pom" title="xml-apis-1.0.b2.pom">xml-apis-1.0.b2.pom</a>
+ 2005-09-20 05:53 2249
+ '''
+ package_version_page = mock.Mock(ok=True, text=package_version_page_text)
+ mock_request_get.side_effect = [
+ package_page,
+ mock.Mock(ok=True, text=''),
+ mock.Mock(ok=True, text=''),
+ package_page,
+ mock.Mock(ok=True, text=''),
+ package_version_page
+ ]
+
+ self.assertEqual(0, ImportableURI.objects.all().count())
+ maven_visitor.add_to_import_queue(url, root_url)
+ self.assertEqual(1, ImportableURI.objects.all().count())
+ importable_uri = ImportableURI.objects.get(uri=url)
+ self.assertEqual('pkg:maven/xml-apis/xml-apis', importable_uri.package_url)
+
+ def test_filter_only_directories(self):
+ timestamps_by_links = {
+ '../': '-',
+ 'foo/': '-',
+ 'foo.pom': '2023-09-28',
+ }
+ expected = {
+ 'foo/': '-',
+ }
+ self.assertEqual(
+ expected,
+ maven_visitor.filter_only_directories(timestamps_by_links)
+ )
+
+ def test_filter_for_artifacts(self):
+ timestamps_by_links = {
+ '../': '2023-09-28',
+ 'foo.pom': '2023-09-28',
+ 'foo.ejb3': '2023-09-28',
+ 'foo.ear': '2023-09-28',
+ 'foo.aar': '2023-09-28',
+ 'foo.apk': '2023-09-28',
+ 'foo.gem': '2023-09-28',
+ 'foo.jar': '2023-09-28',
+ 'foo.nar': '2023-09-28',
+ 'foo.so': '2023-09-28',
+ 'foo.swc': '2023-09-28',
+ 'foo.tar': '2023-09-28',
+ 'foo.tar.gz': '2023-09-28',
+ 'foo.war': '2023-09-28',
+ 'foo.xar': '2023-09-28',
+ 'foo.zip': '2023-09-28',
+ }
+ expected = {
+ 'foo.ejb3': '2023-09-28',
+ 'foo.ear': '2023-09-28',
+ 'foo.aar': '2023-09-28',
+ 'foo.apk': '2023-09-28',
+ 'foo.gem': '2023-09-28',
+ 'foo.jar': '2023-09-28',
+ 'foo.nar': '2023-09-28',
+ 'foo.so': '2023-09-28',
+ 'foo.swc': '2023-09-28',
+ 'foo.tar': '2023-09-28',
+ 'foo.tar.gz': '2023-09-28',
+ 'foo.war': '2023-09-28',
+ 'foo.xar': '2023-09-28',
+ 'foo.zip': '2023-09-28',
+ }
+ self.assertEqual(expected, maven_visitor.filter_for_artifacts(timestamps_by_links))
+
+ def test_collect_links_from_text(self):
+ filter = maven_visitor.filter_only_directories
+ text = '''
+ <a href="../">../</a>
+ <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>
+ 2005-09-20 05:53 -
+ <a href="1.2.01/" title="1.2.01/">1.2.01/</a>
+ 2010-02-03 21:05 -
+ '''
+ expected = {
+ '1.0.b2/': '2005-09-20 05:53',
+ '1.2.01/': '2010-02-03 21:05'
+ }
+ self.assertEqual(
+ expected,
+ maven_visitor.collect_links_from_text(text, filter=filter)
+ )
+
+ def test_create_absolute_urls_for_links(self):
+ filter = maven_visitor.filter_only_directories
+ text = '''
+ <a href="../">../</a>
+ <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>
+ 2005-09-20 05:53 -
+ <a href="1.2.01/" title="1.2.01/">1.2.01/</a>
+ 2010-02-03 21:05 -
+ '''
+ url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/'
+ expected = {
+ 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53',
+ 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05'
+ }
+ self.assertEqual(
+ expected,
+ maven_visitor.create_absolute_urls_for_links(text, url, filter=filter)
+ )
+
+ @mock.patch('requests.get')
+ def test_get_directory_links(self, mock_request_get):
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = '''
+ <a href="../">../</a>
+ <a href="1.0.b2/" title="1.0.b2/">1.0.b2/</a>
+ 2005-09-20 05:53 -
+ <a href="1.2.01/" title="1.2.01/">1.2.01/</a>
+ 2010-02-03 21:05 -
+ '''
+ url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/'
+ expected = {
+ 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53',
+ 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05'
+ }
+ self.assertEqual(expected, maven_visitor.get_directory_links(url))
+
+ @mock.patch('requests.get')
+ def test_get_artifact_links(self, mock_request_get):
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = '''
+ <a href="../">../</a>
+ <a href="xml-apis-1.0.b2.jar" title="xml-apis-1.0.b2.jar">xml-apis-1.0.b2.jar</a>
+ 2005-09-20 05:53 109318
+ <a href="xml-apis-1.0.b2.pom" title="xml-apis-1.0.b2.pom">xml-apis-1.0.b2.pom</a>
+ 2005-09-20 05:53 2249
+ '''
+ url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/'
+ expected = {
+ 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53',
+ }
+ self.assertEqual(expected, maven_visitor.get_artifact_links(url))
+
+ def test_crawl_to_package(self):
+ pass
+
+ def test_crawl_maven_repo_from_root(self):
+ pass
+
+ @mock.patch('requests.get')
+ def test_get_artifact_sha1(self, mock_request_get):
+ sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9'
+ mock_request_get.return_value.ok = True
+ mock_request_get.return_value.text = sha1
+ self.assertEqual(sha1, maven_visitor.get_artifact_sha1('https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1'))
+
+ def test_get_classifier_from_artifact_url(self):
+ artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
+ package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/'
+ package_name = 'livereload-jvm'
+ package_version = '0.2.0'
+ classifier = maven_visitor.get_classifier_from_artifact_url(
+ artifact_url,
+ package_version_page_url,
+ package_name,
+ package_version
+ )
+ self.assertEqual('onejar', classifier)
diff --git a/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json b/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json
index 5dfd490a..8fdc7fae 100644
--- a/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json
+++ b/minecode/tests/testfiles/maven/end2end/expected_mapped_packages.json
@@ -9,7 +9,7 @@
"package_content":null,
"primary_language":null,
"description":"APIs that App Engine provides to you to build your application.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -53,7 +53,7 @@
"package_content":null,
"primary_language":null,
"description":null,
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -97,7 +97,7 @@
"package_content":null,
"primary_language":null,
"description":null,
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -141,7 +141,7 @@
"package_content":null,
"primary_language":null,
"description":"Library which allows discovering classes at runtime",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -185,7 +185,7 @@
"package_content":null,
"primary_language":null,
"description":"Library which allows discovering classes at runtime",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -229,7 +229,7 @@
"package_content":null,
"primary_language":null,
"description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -273,7 +273,7 @@
"package_content":null,
"primary_language":null,
"description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -317,7 +317,7 @@
"package_content":null,
"primary_language":null,
"description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -361,7 +361,7 @@
"package_content":null,
"primary_language":null,
"description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -405,7 +405,7 @@
"package_content":null,
"primary_language":null,
"description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -449,7 +449,7 @@
"package_content":null,
"primary_language":null,
"description":"Google Collections Library is a suite of new collections and collection-related goodness for Java 5.0",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -493,7 +493,7 @@
"package_content":null,
"primary_language":null,
"description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -537,7 +537,7 @@
"package_content":null,
"primary_language":null,
"description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -581,7 +581,7 @@
"package_content":null,
"primary_language":null,
"description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -625,7 +625,7 @@
"package_content":null,
"primary_language":null,
"description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -669,7 +669,7 @@
"package_content":null,
"primary_language":null,
"description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -713,7 +713,7 @@
"package_content":null,
"primary_language":null,
"description":"Protocol Buffers are a way of encoding structured data in an efficient yet\n extensible format.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -757,7 +757,7 @@
"package_content":null,
"primary_language":null,
"description":"The Social Graph Node Mapper is a community project to build a portable library to map social networking sites' URLs to and from a new canonical form (sgn:// URLs).",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
@@ -801,7 +801,7 @@
"package_content":null,
"primary_language":null,
"description":"The Social Graph Node Mapper is a community project to build a portable library to map social networking sites' URLs to and from a new canonical form (sgn:// URLs).",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
diff --git a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json
index bc9d0ae4..a31a465f 100644
--- a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json
+++ b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json
@@ -9,7 +9,7 @@
"package_content":null,
"primary_language":null,
"description":"Common classes to make creating REST services more consistent.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
diff --git a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json
index bc9d0ae4..a31a465f 100644
--- a/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json
+++ b/minecode/tests/testfiles/maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json
@@ -9,7 +9,7 @@
"package_content":null,
"primary_language":null,
"description":"Common classes to make creating REST services more consistent.",
- "release_date":"2009-05-21",
+ "release_date":"2009-05-21T00:00:00Z",
"parties":[],
"keywords":[],
"homepage_url":null,
diff --git a/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json b/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json
index 3513944b..643c8892 100644
--- a/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json
+++ b/minecode/tests/testfiles/rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json
@@ -9,7 +9,7 @@
"package_content":null,
"primary_language":null,
"description":"Get the vendored assets paths in gems.",
- "release_date":"2012-08-03",
+ "release_date":"2012-08-03T00:00:00Z",
"parties":[
{
"type":null,
diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py
index 7fd70ac7..55624772 100644
--- a/minecode/visitors/maven.py
+++ b/minecode/visitors/maven.py
@@ -13,7 +13,9 @@
import io
import json
import logging
+import re
from typing import Dict
+from urllib.parse import urlparse
import arrow
import requests
@@ -305,7 +307,7 @@ def map_maven_package(package_url, package_content):
ancestor_pom_texts=ancestor_pom_texts,
package=package
)
-
+
urls = get_urls(
namespace=package_url.namespace,
@@ -453,6 +455,344 @@ def process_request(purl_str):
return error
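+# Regexes to collect the hyperlinks, and the hyperlinks with their
+# timestamps, from the HTML of a Maven repo directory listing page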
+collect_links = re.compile(r'href="([^"]+)"').findall
+collect_links_and_artifact_timestamps = re.compile(
+ r'<a href="([^"]+)".*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
+).findall
+
+
+def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
+ """
+ Return True if any entry in `links` ends with `file_name`
+ """
+ return any(l.endswith(file_name) for l in links)
+
+
+def check_if_page_has_pom_files(links, **kwargs):
+ """
+ Return True if any entry in `links` ends with .pom.
+ """
+ return any(l.endswith('.pom') for l in links)
+
+
+def check_if_page_has_directories(links, **kwargs):
+ """
+ Return True if any entry, excluding "../", ends with /.
+ """
+ return any(l.endswith('/') for l in links if l != '../')
+
+
+def check_if_package_version_page(links, **kwargs):
+ """
+ Return True if `links` contains pom files and has no directories
+ """
+ return (
+ check_if_page_has_pom_files(links=links)
+ and not check_if_page_has_directories(links=links)
+ )
+
+
+def check_if_package_page(links, **kwargs):
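+ """
+ Return True if `links` is for a Maven package page, that is it links to
+ maven-metadata.xml but does not contain pom files.
+ """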
+ return (
+ check_if_file_name_is_linked_on_page(file_name='maven-metadata.xml', links=links)
+ and not check_if_page_has_pom_files(links=links)
+ )
+
+
+def check_if_maven_root(links, **kwargs):
+ """
+ Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven
+ repo contains "archetype-catalog.xml".
+ """
+ return check_if_file_name_is_linked_on_page(file_name='archetype-catalog.xml', links=links)
+
+
+def check_on_page(url, checker):
+ """
+ Return the result of applying the `checker` function to the links found
+ on the page at `url`. Return False if the page cannot be fetched.
+ """
+ response = requests.get(url)
+ if response:
+ links = collect_links(response.text)
+ return checker(links=links)
+ return False
+
+
+def is_maven_root(url):
+ """
+ Return True if `url` is the root of a Maven repo, False otherwise.
+ """
+ return check_on_page(url, check_if_maven_root)
+
+
+def is_package_page(url):
+ """
+ Return True if `url` is a package page on a Maven repo, False otherwise.
+ """
+ return check_on_page(url, check_if_package_page)
+
+
+def is_package_version_page(url):
+ """
+ Return True if `url` is a package version page on a Maven repo, False otherwise.
+ """
+ return check_on_page(url, check_if_package_version_page)
+
+
+def url_parts(url):
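+ """
+ Return a 3-tuple of (scheme, netloc, path_segments) parsed from `url`,
+ where path_segments is the list of non-empty path components.
+ """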
+ parsed_url = urlparse(url)
+ scheme = parsed_url.scheme
+ netloc = parsed_url.netloc
+ path_segments = [p for p in parsed_url.path.split('/') if p]
+ return scheme, netloc, path_segments
+
+
+def create_url(scheme, netloc, path_segments):
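+ """
+ Return a URL assembled from `scheme`, `netloc` and the list of
+ `path_segments`. This is the inverse of `url_parts`.
+ """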
+ url_template = f'{scheme}://{netloc}'
+ path = '/'.join(path_segments)
+ return f'{url_template}/{path}'
+
+
+def get_maven_root(url):
+ """
+ Given `url`, that is a URL to namespace, package, or artifact in a Maven
+ repo, return the URL to the root of that repo. If a Maven root cannot be
+ determined, return None.
+
+ >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+ 'https://repo1.maven.org/maven2'
+ """
+ scheme, netloc, path_segments = url_parts(url)
+ for i in range(len(path_segments)):
+ segments = path_segments[:i+1]
+ url_segment = create_url(scheme, netloc, segments)
+ if is_maven_root(url_segment):
+ return url_segment
+ return None
+
+
+def determine_namespace_name_version_from_url(url, root_url=None):
+ """
+ Return a 3-tuple containing strings of a Package namespace, name, and
+ version, determined from `url`, where `url` points to namespace, package,
+ specific package version, or artifact on a Maven repo.
+
+ Return None if a Maven root cannot be determined from `url`.
+
+ >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+ ('net.shibboleth', 'parent', '7.11.0')
+ """
+ if not root_url:
+ root_url = get_maven_root(url)
+ if not root_url:
+ raise Exception(f'Error: not a Maven repository: {url}')
+
+ _, remaining_path_segments = url.split(root_url)
+ remaining_path_segments = remaining_path_segments.split('/')
+ remaining_path_segments = [p for p in remaining_path_segments if p]
+
+ namespace_segments = []
+ package_name = ''
+ package_version = ''
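+ # Walk the path segments left to right: the segments before the package
+ # page form the namespace, the package page segment is the name, and the
+ # version page segment is the version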
+ for i in range(len(remaining_path_segments)):
+ segment = remaining_path_segments[i]
+ segments = remaining_path_segments[:i+1]
+ path = '/'.join(segments)
+ url_segment = f'{root_url}/{path}'
+ if is_package_page(url_segment):
+ package_name = segment
+ elif is_package_version_page(url_segment):
+ package_version = segment
+ else:
+ namespace_segments.append(segment)
+ namespace = '.'.join(namespace_segments)
+ return namespace, package_name, package_version
+
+
+def add_to_import_queue(url, root_url):
+ """
+ Create ImportableURI for the Maven repo package page at `url`.
+ """
+ from minecode.models import ImportableURI
+ data = None
+ response = requests.get(url)
+ if response:
+ data = response.text
+ namespace, name, _ = determine_namespace_name_version_from_url(url, root_url)
+ purl = PackageURL(
+ type='maven',
+ namespace=namespace,
+ name=name,
+ )
+ importable_uri = ImportableURI.objects.insert(url, data, purl)
+ if importable_uri:
+ logger.info(f'Inserted {url} into ImportableURI queue')
+
+
+def filter_only_directories(timestamps_by_links):
+ """
+ Given a mapping of `timestamps_by_links`, return a new mapping that keeps
+ only the entries whose link is a directory name ending with `/`, excluding
+ the parent directory link `../`.
+ """
+ timestamps_by_links_filtered = {}
+ for link, timestamp in timestamps_by_links.items():
+ if link != '../' and link.endswith('/'):
+ timestamps_by_links_filtered[link] = timestamp
+ return timestamps_by_links_filtered
+
+
+valid_artifact_extensions = [
+ 'ejb3',
+ 'ear',
+ 'aar',
+ 'apk',
+ 'gem',
+ 'jar',
+ 'nar',
+ # 'pom',
+ 'so',
+ 'swc',
+ 'tar',
+ 'tar.gz',
+ 'war',
+ 'xar',
+ 'zip',
+]
+
+
+def filter_for_artifacts(timestamps_by_links):
+ """
+ Given a mapping of `timestamps_by_links`, where the links are the filenames
+ of Maven artifacts, return a mapping of filenames whose extension is in
+ `valid_artifact_extensions` and their timestamps.
+ """
+ timestamps_by_links_filtered = {}
+ for link, timestamp in timestamps_by_links.items():
+ for ext in valid_artifact_extensions:
+ if link.endswith(f'.{ext}'):
+ timestamps_by_links_filtered[link] = timestamp
+ break
+ return timestamps_by_links_filtered
+
+
+def collect_links_from_text(text, filter):
+ """
+ Return a mapping of link locations to their timestamps, collected from the
+ HTML `text` content and filtered using the `filter` function.
+ """
+ links_and_timestamps = collect_links_and_artifact_timestamps(text)
+ timestamps_by_links = {}
+ for link, timestamp in links_and_timestamps:
+ if timestamp == '-':
+ timestamp = ''
+ timestamps_by_links[link] = timestamp
+
+ timestamps_by_links = filter(timestamps_by_links=timestamps_by_links)
+ return timestamps_by_links
+
+
+def create_absolute_urls_for_links(text, url, filter):
+ """
+ Given the `text` contents from `url`, return a mapping of absolute URLs to
+ links from `url` and their timestamps, that is then filtered by `filter`.
+ """
+ timestamps_by_absolute_links = {}
+ url = url.rstrip('/')
+ timestamps_by_links = collect_links_from_text(text, filter)
+ for link, timestamp in timestamps_by_links.items():
+ if not link.startswith(url):
+ link = f'{url}/{link}'
+ timestamps_by_absolute_links[link] = timestamp
+ return timestamps_by_absolute_links
+
+
+def get_directory_links(url):
+ """
+ Return a mapping of absolute directory URLs to timestamps for the
+ directory hyperlinks on the page at `url`
+ """
+ timestamps_by_directory_links = {}
+ response = requests.get(url)
+ if response:
+ timestamps_by_directory_links = create_absolute_urls_for_links(
+ response.text,
+ url=url,
+ filter=filter_only_directories
+ )
+ return timestamps_by_directory_links
+
+
+def get_artifact_links(url):
+ """
+ Return a mapping of absolute artifact URLs to timestamps for the
+ artifact hyperlinks on the page at `url`
+ """
+ timestamps_by_artifact_links = {}
+ response = requests.get(url)
+ if response:
+ timestamps_by_artifact_links = create_absolute_urls_for_links(
+ response.text,
+ url=url,
+ filter=filter_for_artifacts
+ )
+ return timestamps_by_artifact_links
+
+
+def crawl_to_package(url, root_url):
+ """
+ Given a Maven repo `url`, crawl it depth-first; when a package page is
+ found, add it to the import queue.
+ """
+ if is_package_page(url):
+ add_to_import_queue(url, root_url)
+ return
+
+ for link in get_directory_links(url):
+ crawl_to_package(link, root_url)
+
+
+def crawl_maven_repo_from_root(root_url):
+ """
+ Given the `root_url` of a Maven repo, traverse the repo depth-first and add
+ package pages to the import queue.
+ """
+ crawl_to_package(root_url, root_url)
+
+
+def get_artifact_sha1(artifact_url):
+ """
+ Return the SHA1 value of the Maven artifact located at `artifact_url`.
+ """
+ sha1 = None
+ artifact_sha1_url = f'{artifact_url}.sha1'
+ response = requests.get(artifact_sha1_url)
+ if response:
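+ # a .sha1 file contains the checksum, sometimes followed by the file
+ # name; keep only the first token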
+ sha1_contents = response.text.strip().split()
+ sha1 = sha1_contents[0]
+ sha1 = validate_sha1(sha1)
+ return sha1
+
+
+def get_classifier_from_artifact_url(artifact_url, package_version_page_url, package_name, package_version):
+ """
+ Return the classifier from a Maven artifact URL `artifact_url`, or None
+ if a classifier cannot be determined from `artifact_url`
+ """
+ classifier = None
+ # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0
+ package_version_page_url = package_version_page_url.rstrip('/')
+ # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0
+ leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}'
+ # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar'
+ # ['', '-onejar.jar']
+ _, remaining_url_portion = artifact_url.split(leading_url_portion)
+ # ['-onejar', 'jar']
+ remaining_url_portions = remaining_url_portion.split('.')
+ if remaining_url_portions and remaining_url_portions[0]:
+ # '-onejar'
+ classifier = remaining_url_portions[0]
+ if classifier.startswith('-'):
+ # 'onejar'
+ classifier = classifier[1:]
+ return classifier
+
+
@visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
@visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties')
class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor):
diff --git a/packagedb/api.py b/packagedb/api.py
index d8d7f59f..3dd17d7e 100644
--- a/packagedb/api.py
+++ b/packagedb/api.py
@@ -9,7 +9,9 @@
import logging
from django.core.exceptions import ValidationError
+from django.db.models import OuterRef
from django.db.models import Q
+from django.db.models import Subquery
from django_filters.rest_framework import FilterSet
from django_filters.filters import Filter
from django_filters.filters import OrderingFilter
@@ -565,12 +567,20 @@ def filter_by_checksums(self, request, *args, **kwargs):
lookups = Q()
for field, value in data.items():
+ # Subquery to get the ids of the Packages with the earliest release_date for each `field`
+ earliest_release_dates = Package.objects.filter(
+ **{field: OuterRef(field)}
+ ).order_by('release_date').values('id')[:1]
+
value = value or []
- # We create this intermediate dictionary so we can modify the field
- # name to have __in at the end
- d = {f'{field}__in': value}
- lookups |= Q(**d)
+ lookups |= Q(
+ **{
+ f'{field}__in': value,
+ 'id__in': Subquery(earliest_release_dates),
+ }
+ )
+ # Query to get the full Package objects with the earliest release_date for each checksum value
qs = Package.objects.filter(lookups)
paginated_qs = self.paginate_queryset(qs)
if enhance_package_data:
@@ -803,7 +813,7 @@ def get_all_versions(purl: PackageURL):
except InvalidVersion:
logger.warning(f"Invalid version '{package_version.value}' for '{purl}'")
pass
-
+
return result
diff --git a/packagedb/migrations/0047_add_search_vector_field_to_package.py b/packagedb/migrations/0047_add_search_vector_field_to_package.py
index c2687a27..9eccd785 100644
--- a/packagedb/migrations/0047_add_search_vector_field_to_package.py
+++ b/packagedb/migrations/0047_add_search_vector_field_to_package.py
@@ -1,6 +1,6 @@
# Generated by Django 3.1.5 on 2021-03-10 19:04
-import django.contrib.postgres.search
+from django.contrib.postgres.search import SearchVector, SearchVectorField
from django.db import migrations
@@ -9,10 +9,26 @@ def populate_search_vector_field(apps, schema_editor):
Data migration used to lowercase any purl field values that currently exist.
"""
Package = apps.get_model('packagedb', 'Package')
-
- for pkg in Package.objects.iterator():
- pkg.search_vector = search.SearchVector('namespace', 'name', 'version', 'download_url')
- pkg.save()
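+ # Recompute search_vector in chunks and persist with bulk_update()
+ # instead of saving one row at a time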
+ packages = Package.objects.iterator(chunk_size=5000)
+ updated = []
+ for i, package in enumerate(packages):
+ if (not i % 5000) and updated:
+ Package.objects.bulk_update(
+ objs=updated,
+ fields=[
+ 'search_vector',
+ ]
+ )
+ updated = []
+ package.search_vector = SearchVector('namespace', 'name', 'version', 'download_url')
+ updated.append(package)
+ if updated:
+ Package.objects.bulk_update(
+ objs=updated,
+ fields=[
+ 'search_vector',
+ ]
+ )
class Migration(migrations.Migration):
@@ -25,7 +41,7 @@ class Migration(migrations.Migration):
migrations.AddField(
model_name='package',
name='search_vector',
- field=django.contrib.postgres.search.SearchVectorField(null=True),
+ field=SearchVectorField(null=True),
),
migrations.RunPython(populate_search_vector_field),
]
diff --git a/packagedb/migrations/0059_compute_package_license_data.py b/packagedb/migrations/0059_compute_package_license_data.py
index c57d14e9..109c4254 100644
--- a/packagedb/migrations/0059_compute_package_license_data.py
+++ b/packagedb/migrations/0059_compute_package_license_data.py
@@ -9,18 +9,51 @@ def compute_package_declared_license_expression_spdx(apps, schema_editor):
Compute Package `declared_license_expression_spdx`, when missing,
from `declared_license_expression`, when available.
"""
- from licensedcode.cache import build_spdx_license_expression
+ from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError
+ from packageurl import PackageURL
Package = apps.get_model('packagedb', 'Package')
packages = Package.objects.filter(
~Q(declared_license_expression="") & Q(declared_license_expression_spdx="") |
Q(declared_license_expression__isnull=False) & Q(declared_license_expression_spdx__isnull=True)
)
+ package_count = packages.count()
+ chunk_size = 2000
+ iterator = packages.iterator(chunk_size=chunk_size)
+ updated = []
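+ # flush the accumulated updates every chunk_size rows to keep memory
+ # use bounded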
+ for i, package in enumerate(iterator):
+ if (not i % chunk_size) and updated:
+ Package.objects.bulk_update(
+ objs=updated,
+ fields=[
+ 'declared_license_expression_spdx',
+ ]
+ )
+ updated = []
+ print(f" {i:,} / {package_count:,} computed and updated")
+ try:
+ if spdx := build_spdx_license_expression(package.declared_license_expression):
+ package.declared_license_expression_spdx = spdx
+ updated.append(package)
+ except InvalidLicenseKeyError as e:
+ package_url = PackageURL(
+ type=package.type,
+ namespace=package.namespace,
+ name=package.name,
+ version=package.version,
+ qualifiers=package.qualifiers,
+ subpath=package.subpath
+ )
+ print(f" Error processing {package_url}: {e}")
- for package in packages:
- if spdx := build_spdx_license_expression(package.declared_license_expression):
- package.declared_license_expression_spdx = spdx
- package.save()
+ if updated:
+ print("Updating remaining Packages...")
+ Package.objects.bulk_update(
+ objs=updated,
+ fields=[
+ 'declared_license_expression_spdx',
+ ]
+ )
class Migration(migrations.Migration):
diff --git a/packagedb/migrations/0062_compute_resource_license_data.py b/packagedb/migrations/0062_compute_resource_license_data.py
index 0cef20fc..f1b996c8 100644
--- a/packagedb/migrations/0062_compute_resource_license_data.py
+++ b/packagedb/migrations/0062_compute_resource_license_data.py
@@ -13,13 +13,13 @@ def compute_resource_detected_license_expression(apps, schema_editor):
From scancode.io
"""
from license_expression import combine_expressions
- from licensedcode.cache import build_spdx_license_expression
+ from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError
if settings.IS_TESTS:
return
Resource = apps.get_model("packagedb", "Resource")
- resources = Resource.objects.filter(~Q(license_expressions=[]) | Q(license_expressions__isnull=False)).only('license_expressions')
+ resources = Resource.objects.filter(~Q(license_expressions=[])).filter(license_expressions__isnull=False)
object_count = resources.count()
print(f"\nCompute detected_license_expression for {object_count:,} resources.")
@@ -29,7 +29,11 @@ def compute_resource_detected_license_expression(apps, schema_editor):
unsaved_objects = []
for index, resource in enumerate(iterator, start=1):
- combined_expression = str(combine_expressions(resource.license_expressions))
+ combined_expression = combine_expressions(resource.license_expressions)
+ if not combined_expression:
+ print(f' invalid license expression for {resource.path}: {combined_expression}')
+ continue
+ combined_expression = str(combined_expression)
# gpl-2.0 OR broadcom-linking-unmodified OR proprietary-license
# build_spdx_license_expression("broadcom-linking-unmodified")
# AttributeError: 'LicenseSymbol' object has no attribute 'wrapped'
@@ -122,7 +126,7 @@ def compute_resource_license_detections(apps, schema_editor):
From scancode.io
"""
Resource = apps.get_model("packagedb", "Resource")
- resources = Resource.objects.filter(~Q(licenses=[]) | Q(licenses__isnull=False)).only('licenses')
+ resources = Resource.objects.filter(~Q(licenses=[])).filter(licenses__isnull=False)
object_count = resources.count()
print(f"\nCompute license_detections for {object_count:,} resources.")
diff --git a/packagedb/migrations/0070_auto_20230706_0045.py b/packagedb/migrations/0070_auto_20230706_0045.py
index 9d18cbdd..d9fa116a 100644
--- a/packagedb/migrations/0070_auto_20230706_0045.py
+++ b/packagedb/migrations/0070_auto_20230706_0045.py
@@ -66,8 +66,11 @@ def create_maven_package_sets(apps, schema_editor):
"version",
"qualifiers",
"subpath",
- ).iterator(
- chunk_size=5000
+ )
+ package_count = maven_packages_without_package_set.count()
+ chunk_size = 2000
+ iterator = maven_packages_without_package_set.iterator(
+ chunk_size=chunk_size
)
prev_namespace = None
@@ -75,7 +78,17 @@ def create_maven_package_sets(apps, schema_editor):
prev_version = None
prev_package = None
unupdated_packages = []
- for package in maven_packages_without_package_set:
+ for i, package in enumerate(iterator):
+ if not (i % chunk_size) and unupdated_packages:
+ Package.objects.bulk_update(
+ objs=unupdated_packages,
+ fields=[
+ "package_content",
+ ]
+ )
+ unupdated_packages = []
+ print(f" {i:,} / {package_count:,} updated")
+
if "source" in package.qualifiers:
package_content = PackageContentType.SOURCE_ARCHIVE
else:
diff --git a/packagedb/migrations/0078_alter_package_release_date.py b/packagedb/migrations/0078_alter_package_release_date.py
new file mode 100644
index 00000000..b33739fa
--- /dev/null
+++ b/packagedb/migrations/0078_alter_package_release_date.py
@@ -0,0 +1,22 @@
+# Generated by Django 4.1.2 on 2023-09-29 21:24
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("packagedb", "0077_remove_package_declared_license_expression_spdx_and_more"),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name="package",
+ name="release_date",
+ field=models.DateTimeField(
+ blank=True,
+ db_index=True,
+ help_text="The date and time that the package file was created, or when it was posted to its original download source.",
+ null=True,
+ ),
+ ),
+ ]
diff --git a/packagedb/models.py b/packagedb/models.py
index e4d7c58a..23588273 100644
--- a/packagedb/models.py
+++ b/packagedb/models.py
@@ -254,12 +254,12 @@ class AbstractPackage(models.Model):
"By convention the first line should be a summary when available."
),
)
- release_date = models.DateField(
+ release_date = models.DateTimeField(
blank=True,
null=True,
db_index=True,
help_text=_(
- "The date that the package file was created, or when "
+ "The date and time that the package file was created, or when "
"it was posted to its original download source."
),
)