aboutcode-org · JonoYang · Oct 18, 2023 · Sep 6, 2023 · Sep 12, 2023 · Sep 12, 2023
diff --git a/matchcode/api.py b/matchcode/api.py
@@ -15,6 +15,7 @@
 from rest_framework.decorators import action
 from rest_framework.response import Response
 from rest_framework.serializers import CharField
+from rest_framework.serializers import FloatField
 from rest_framework.serializers import HyperlinkedRelatedField
 from rest_framework.serializers import ModelSerializer
 from rest_framework.serializers import ReadOnlyField
@@ -24,6 +25,7 @@
 from matchcode_toolkit.fingerprinting import create_halohash_chunks
 from matchcode_toolkit.fingerprinting import hexstring_to_binarray
 from matchcode_toolkit.fingerprinting import split_fingerprint
+from matchcode_toolkit.halohash import byte_hamming_distance
 from matchcode.models import ExactFileIndex
 from matchcode.models import ExactPackageArchiveIndex
 from matchcode.models import ApproximateDirectoryContentIndex
@@ -91,6 +93,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer):
         lookup_field='uuid',
         read_only=True
     )
+    similarity_score = FloatField()
 
 
 class CharMultipleWidget(widgets.TextInput):
@@ -271,11 +274,18 @@ def match(self, request):
         for fingerprint in unique_fingerprints:
             matches = model_class.match(fingerprint)
             for match in matches:
+                _, bah128 = split_fingerprint(fingerprint)
+                # Get fingerprint from the match
+                fp = match.fingerprint()
+                _, match_bah128 = split_fingerprint(fp)
+                hd = byte_hamming_distance(bah128, match_bah128)
+                similarity_score = (128 - hd) / 128
                 results.append(
                     {
                         'fingerprint': fingerprint,
-                        'matched_fingerprint': match.fingerprint(),
+                        'matched_fingerprint': fp,
                         'package': match.package,
+                        'similarity_score': similarity_score,
                     }
                 )
 

diff --git a/matchcode/tests/test_api.py b/matchcode/tests/test_api.py
@@ -117,6 +117,7 @@ def test_api_approximate_directory_content_index_match_close_match(self):
         self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
         expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
         self.assertEqual(expected_package, result['package'])
+        self.assertEqual(0.9453125, result['similarity_score'])
 
     def test_api_approximate_directory_structure_index_match_close_match(self):
         # This test fingerprint has a hamming distance of 7 from the expected fingerprint
@@ -133,6 +134,7 @@ def test_api_approximate_directory_structure_index_match_close_match(self):
         self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
         expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
         self.assertEqual(expected_package, result['package'])
+        self.assertEqual(0.9453125, result['similarity_score'])
 
     def test_api_approximate_directory_content_index_match(self):
         test_fingerprint = '00000007af7d63765c78fa516b5353f5ffa7df45'
@@ -147,6 +149,7 @@ def test_api_approximate_directory_content_index_match(self):
         self.assertEqual(test_fingerprint, result['matched_fingerprint'])
         expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
         self.assertEqual(expected_package, result['package'])
+        self.assertEqual(1.0, result['similarity_score'])
 
     def test_api_approximate_directory_structure_index_match(self):
         test_fingerprint = '00000004d10982208810240820080a6a3e852486'
@@ -161,3 +164,4 @@ def test_api_approximate_directory_structure_index_match(self):
         self.assertEqual(test_fingerprint, result['matched_fingerprint'])
         expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
         self.assertEqual(expected_package, result['package'])
+        self.assertEqual(1.0, result['similarity_score'])
diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py
@@ -0,0 +1,75 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from dateutil.parser import parse as dateutil_parse
+from os.path import dirname
+import logging
+import sys
+
+import requests
+
+from minecode.management.commands import VerboseCommand
+from minecode.visitors.maven import collect_links_from_text
+from minecode.visitors.maven import filter_for_artifacts
+from packagedb.models import Package
+
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+logging.basicConfig(stream=sys.stdout)  # send root logging output to stdout
+logger.setLevel(logging.INFO)
+
+TRACE = False  # set to True to lower this logger's level to DEBUG
+if TRACE:
+    logger.setLevel(logging.DEBUG)
+
+
+class Command(VerboseCommand):
+    help = 'Get and set release_date for Maven Packages'
+
+    def handle(self, *args, **options):
+        queryset = Package.objects.filter(
+            type='maven',
+            release_date=None,
+            download_url__startswith='https://repo1.maven.org/maven2'
+        )
+        object_count = queryset.count()
+        chunk_size = 2000
+        iterator = queryset.iterator(chunk_size=chunk_size)
+        unsaved_objects = []
+
+        logger.info(f'Updating release_date for {object_count} packages')
+        for index, package in enumerate(iterator, start=1):
+            download_url = package.download_url
+            package_url = package.package_url
+            logger.info(f'Updating release_date for package {package_url} ({download_url})')
+            package_version_page_url = dirname(download_url)
+            filename = download_url.rsplit('/')[-1]
+            response = requests.get(package_version_page_url)
+            if response:
+                timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts)
+                timestamp = timestamps_by_links.get(filename)
+                if not timestamp:
+                    logger.info(f'\tCould not get release_date for package {package_url} ({download_url})')
+                    continue
+                timestamp = dateutil_parse(timestamp)
+                package.release_date = timestamp
+                unsaved_objects.append(package)
+                logger.info(f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}')
+            else:
+                logger.info(f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}')
+            if not (index % chunk_size) and unsaved_objects:
+                logger.info(f'{index:,} / {object_count:,} Packages processed')
+
+        logger.info('Updating Package objects...')
+        updated_packages_count = Package.objects.bulk_update(
+            objs=unsaved_objects,
+            fields=['release_date'],
+            batch_size=1000,
+        )
+        logger.info(f'Updated {updated_packages_count} Package objects')
diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py
@@ -0,0 +1,167 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from dateutil.parser import parse as dateutil_parse
+import logging
+import signal
+import sys
+import time
+
+import requests
+
+from django.db import transaction
+from django.utils import timezone
+from packageurl import PackageURL
+
+from minecode.management.commands import get_error_message
+from minecode.management.commands import VerboseCommand
+from minecode.models import ImportableURI
+from minecode.visitors.maven import get_artifact_links
+from minecode.visitors.maven import get_classifier_from_artifact_url
+from minecode.visitors.maven import collect_links_from_text
+from minecode.visitors.maven import filter_only_directories
+from minecode.visitors.maven import get_artifact_sha1
+from minecode.model_utils import merge_or_create_package
+from packagedcode.models import PackageData
+from packagedb.models import Package
+from minecode.visitors.maven import determine_namespace_name_version_from_url
+
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+logging.basicConfig(stream=sys.stdout)  # send root logging output to stdout
+logger.setLevel(logging.INFO)
+
+TRACE = False  # set to True to lower this logger's level to DEBUG
+if TRACE:
+    logger.setLevel(logging.DEBUG)
+
+# sleep duration in seconds when the queue is empty
+SLEEP_WHEN_EMPTY = 10
+
+MUST_STOP = False  # set to True by stop_handler(); checked at the top of each loop in Command.handle()
+
+
+def stop_handler(*args, **kwargs):
+    """
+    Signal handler to set global variable to True.
+    """
+    global MUST_STOP  # rebind the module-level flag, not a local
+    MUST_STOP = True  # any arguments passed to the handler are ignored
+
+
+signal.signal(signal.SIGTERM, stop_handler)  # registered at import time: SIGTERM sets MUST_STOP
+
+
+class Command(VerboseCommand):
+    help = 'Run a Package request queue.'
+
+    def handle(self, *args, **options):
+        """
+        Get the next processable PriorityResourceURI and start the
+        processing. Loops forever and sleeps a short while if there are
+        no PriorityResourceURI left to process.
+        """
+
+        global MUST_STOP
+
+        sleeping = False
+        processed_counter = 0
+
+        while True:
+            if MUST_STOP:
+                logger.info('Graceful exit of the request queue.')
+                break
+
+            with transaction.atomic():
+                importable_uri = ImportableURI.objects.get_next_request()
+
+            if not importable_uri:
+                # Only log a single message when we go to sleep
+                if not sleeping:
+                    sleeping = True
+                    logger.info('No more processable request, sleeping...')
+
+                time.sleep(SLEEP_WHEN_EMPTY)
+                continue
+
+            sleeping = False
+
+            # process request
+            logger.info('Processing {}'.format(importable_uri))
+            try:
+                errors = process_request(importable_uri)
+            except Exception as e:
+                errors = 'Error: Failed to process ImportableURI: {}\n'.format(
+                    repr(importable_uri))
+                errors += get_error_message(e)
+            finally:
+                if errors:
+                    importable_uri.processing_error = errors
+                    logger.error(errors)
+                importable_uri.processed_date = timezone.now()
+                importable_uri.wip_date = None
+                importable_uri.save()
+                processed_counter += 1
+
+        return processed_counter
+
+
+def process_request(importable_uri):
+    uri = importable_uri.uri
+    uri = uri.rstrip('/')
+    data = importable_uri.data
+    if not data:
+        # collect data again if we don't have it
+        response = requests.get(uri)
+        if response:
+            data = requests.text
+
+    purl = importable_uri.package_url
+    if purl:
+        package_url = PackageURL.from_string(purl)
+        namespace = package_url.namespace
+        name = package_url.name
+    else:
+        namespace, name, _ = determine_namespace_name_version_from_url(uri)
+
+    timestamps_by_directory_links = collect_links_from_text(data, filter_only_directories)
+    # Go into each version directory
+    for directory_link in timestamps_by_directory_links.keys():
+        version = directory_link.rstrip('/')
+        version_page_url = f'{uri}/{version}'
+        timestamps_by_artifact_links = get_artifact_links(version_page_url)
+        for artifact_link, timestamp in timestamps_by_artifact_links.items():
+            sha1 = get_artifact_sha1(artifact_link)
+            classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version)
+            qualifiers = None
+            if classifier:
+                qualifiers = f'classifier={classifier}'
+            release_date = dateutil_parse(timestamp)
+            package_data = PackageData(
+                type='maven',
+                namespace=namespace,
+                name=name,
+                version=version,
+                qualifiers=qualifiers,
+                download_url=artifact_link,
+                sha1=sha1,
+                release_date=release_date,
+            )
+            package, created, merged, map_error = merge_or_create_package(
+                scanned_package=package_data,
+                visit_level=50
+            )
+            if created:
+                logger.info(f'Created package {package}')
+            if merged:
+                logger.info(f'Updated package {package}')
+            if map_error:
+                logger.error(f'Error encountered: {map_error}')
+                importable_uri.processing_error = map_error
+                importable_uri.save()
diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py
@@ -0,0 +1,31 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import logging
+import sys
+
+from minecode.management.commands import VerboseCommand
+from minecode.visitors.maven import crawl_maven_repo_from_root
+
+
+logger = logging.getLogger(__name__)  # module-level logger named after this module
+logging.basicConfig(stream=sys.stdout)  # send root logging output to stdout
+logger.setLevel(logging.INFO)
+
+TRACE = False  # set to True to lower this logger's level to DEBUG
+if TRACE:
+    logger.setLevel(logging.DEBUG)
+
+
+class Command(VerboseCommand):
+    help = 'Run a Package request queue.'
+
+    def handle(self, *args, **options):
+        maven_root_url = 'https://repo.maven.apache.org/maven2'
+        crawl_maven_repo_from_root(root_url=maven_root_url)