Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

179 focused maven visitor #192

Merged
merged 19 commits into from
Oct 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion matchcode/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.serializers import CharField
from rest_framework.serializers import FloatField
from rest_framework.serializers import HyperlinkedRelatedField
from rest_framework.serializers import ModelSerializer
from rest_framework.serializers import ReadOnlyField
Expand All @@ -24,6 +25,7 @@
from matchcode_toolkit.fingerprinting import create_halohash_chunks
from matchcode_toolkit.fingerprinting import hexstring_to_binarray
from matchcode_toolkit.fingerprinting import split_fingerprint
from matchcode_toolkit.halohash import byte_hamming_distance
from matchcode.models import ExactFileIndex
from matchcode.models import ExactPackageArchiveIndex
from matchcode.models import ApproximateDirectoryContentIndex
Expand Down Expand Up @@ -91,6 +93,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer):
lookup_field='uuid',
read_only=True
)
similarity_score = FloatField()


class CharMultipleWidget(widgets.TextInput):
Expand Down Expand Up @@ -271,11 +274,18 @@ def match(self, request):
for fingerprint in unique_fingerprints:
matches = model_class.match(fingerprint)
for match in matches:
_, bah128 = split_fingerprint(fingerprint)
# Get fingerprint from the match
fp = match.fingerprint()
_, match_bah128 = split_fingerprint(fp)
hd = byte_hamming_distance(bah128, match_bah128)
similarity_score = (128 - hd) / 128
results.append(
{
'fingerprint': fingerprint,
'matched_fingerprint': match.fingerprint(),
'matched_fingerprint': fp,
'package': match.package,
'similarity_score': similarity_score,
}
)

Expand Down
4 changes: 4 additions & 0 deletions matchcode/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def test_api_approximate_directory_content_index_match_close_match(self):
self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
self.assertEqual(expected_package, result['package'])
self.assertEqual(0.9453125, result['similarity_score'])

def test_api_approximate_directory_structure_index_match_close_match(self):
# This test fingerprint has a hamming distance of 7 from the expected fingerprint
Expand All @@ -133,6 +134,7 @@ def test_api_approximate_directory_structure_index_match_close_match(self):
self.assertEqual(expected_matched_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
self.assertEqual(expected_package, result['package'])
self.assertEqual(0.9453125, result['similarity_score'])

def test_api_approximate_directory_content_index_match(self):
test_fingerprint = '00000007af7d63765c78fa516b5353f5ffa7df45'
Expand All @@ -147,6 +149,7 @@ def test_api_approximate_directory_content_index_match(self):
self.assertEqual(test_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package1.uuid])
self.assertEqual(expected_package, result['package'])
self.assertEqual(1.0, result['similarity_score'])

def test_api_approximate_directory_structure_index_match(self):
test_fingerprint = '00000004d10982208810240820080a6a3e852486'
Expand All @@ -161,3 +164,4 @@ def test_api_approximate_directory_structure_index_match(self):
self.assertEqual(test_fingerprint, result['matched_fingerprint'])
expected_package = 'http://testserver' + reverse('api:package-detail', args=[self.test_package2.uuid])
self.assertEqual(expected_package, result['package'])
self.assertEqual(1.0, result['similarity_score'])
75 changes: 75 additions & 0 deletions minecode/management/commands/get_maven_release_dates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from dateutil.parser import parse as dateutil_parse
from os.path import dirname
import logging
import sys

import requests

from minecode.management.commands import VerboseCommand
from minecode.visitors.maven import collect_links_from_text
from minecode.visitors.maven import filter_for_artifacts
from packagedb.models import Package


# Flip TRACE to True to get DEBUG-level output from this module's logger.
TRACE = False

# Log records go to stdout; this module's logger runs at INFO unless TRACE
# is enabled.
logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG if TRACE else logging.INFO)


class Command(VerboseCommand):
    """
    Management command that fills in the missing ``release_date`` of Maven
    Packages whose download URL is under https://repo1.maven.org/maven2.
    """
    help = 'Get and set release_date for Maven Packages'

    def handle(self, *args, **options):
        """
        For each Maven Package without a release_date, fetch the listing page
        of the directory containing its download URL, look up the timestamp
        reported for the downloaded file name and store it as the Package
        release_date.

        Packages are collected in memory and written with a single
        bulk_update once every Package has been visited. A Package whose
        listing page cannot be fetched, or whose file name has no timestamp in
        the listing, is logged and skipped.
        """
        # Seconds to wait for the remote server before giving up on one
        # request. Without a timeout a stalled connection blocks the command
        # forever.
        request_timeout = 30

        queryset = Package.objects.filter(
            type='maven',
            release_date=None,
            download_url__startswith='https://repo1.maven.org/maven2'
        )
        object_count = queryset.count()
        chunk_size = 2000
        iterator = queryset.iterator(chunk_size=chunk_size)
        unsaved_objects = []

        logger.info(f'Updating release_date for {object_count} packages')
        for index, package in enumerate(iterator, start=1):
            download_url = package.download_url
            package_url = package.package_url
            logger.info(f'Updating release_date for package {package_url} ({download_url})')
            package_version_page_url = dirname(download_url)
            filename = download_url.rsplit('/', 1)[-1]

            # A network failure on one Package must not abort the whole run:
            # the updates collected so far are only written after the loop.
            response = None
            try:
                response = requests.get(package_version_page_url, timeout=request_timeout)
            except requests.RequestException as e:
                logger.error(f'\tRequest to {package_version_page_url} failed: {e!r}')

            # A requests Response is falsy when its status code is an error.
            if response:
                timestamps_by_links = collect_links_from_text(response.text, filter=filter_for_artifacts)
                timestamp = timestamps_by_links.get(filename)
                if timestamp:
                    timestamp = dateutil_parse(timestamp)
                    package.release_date = timestamp
                    unsaved_objects.append(package)
                    logger.info(f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}')
                else:
                    logger.info(f'\tCould not get release_date for package {package_url} ({download_url})')
            else:
                logger.info(f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}')

            # Progress report; reached for every Package, including the
            # skipped ones.
            if not (index % chunk_size) and unsaved_objects:
                logger.info(f'{index:,} / {object_count:,} Packages processed')

        logger.info('Updating Package objects...')
        updated_packages_count = Package.objects.bulk_update(
            objs=unsaved_objects,
            fields=['release_date'],
            batch_size=1000,
        )
        logger.info(f'Updated {updated_packages_count} Package objects')
167 changes: 167 additions & 0 deletions minecode/management/commands/import_queue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from dateutil.parser import parse as dateutil_parse
import logging
import signal
import sys
import time

import requests

from django.db import transaction
from django.utils import timezone
from packageurl import PackageURL

from minecode.management.commands import get_error_message
from minecode.management.commands import VerboseCommand
from minecode.models import ImportableURI
from minecode.visitors.maven import get_artifact_links
from minecode.visitors.maven import get_classifier_from_artifact_url
from minecode.visitors.maven import collect_links_from_text
from minecode.visitors.maven import filter_only_directories
from minecode.visitors.maven import get_artifact_sha1
from minecode.model_utils import merge_or_create_package
from packagedcode.models import PackageData
from packagedb.models import Package
from minecode.visitors.maven import determine_namespace_name_version_from_url


# sleep duration in seconds when the queue is empty
SLEEP_WHEN_EMPTY = 10

# Module-level stop flag: the queue loop exits once this becomes True.
MUST_STOP = False

# Flip TRACE to True to get DEBUG-level output from this module's logger.
TRACE = False

# Log records go to stdout; this module's logger runs at INFO unless TRACE
# is enabled.
logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG if TRACE else logging.INFO)


def stop_handler(*args, **kwargs):
    """
    Set the module-level MUST_STOP flag to True.

    Accepts and ignores any positional and keyword arguments so that it can
    be installed as a signal handler.
    """
    global MUST_STOP
    MUST_STOP = True


# Install stop_handler for SIGTERM at import time so that receiving the
# signal sets MUST_STOP instead of terminating the process immediately.
signal.signal(signal.SIGTERM, stop_handler)


class Command(VerboseCommand):
    """
    Management command running the ImportableURI processing queue.
    """
    help = 'Run a Package request queue.'

    def handle(self, *args, **options):
        """
        Get the next processable ImportableURI and start the processing.
        Loops forever and sleeps a short while if there are no ImportableURI
        left to process. The loop exits once MUST_STOP has been set to True.

        Return the number of ImportableURI processed.
        """

        global MUST_STOP

        sleeping = False
        processed_counter = 0

        while True:
            if MUST_STOP:
                logger.info('Graceful exit of the request queue.')
                break

            with transaction.atomic():
                importable_uri = ImportableURI.objects.get_next_request()

            if not importable_uri:
                # Only log a single message when we go to sleep
                if not sleeping:
                    sleeping = True
                    logger.info('No more processable request, sleeping...')

                time.sleep(SLEEP_WHEN_EMPTY)
                continue

            sleeping = False

            # process request
            logger.info('Processing {}'.format(importable_uri))
            # Bind `errors` before the try block: the `finally` clause reads
            # it even when an exception that is not an Exception subclass
            # (such as KeyboardInterrupt) escapes process_request, and it
            # would otherwise be unbound there.
            errors = None
            try:
                errors = process_request(importable_uri)
            except Exception as e:
                errors = 'Error: Failed to process ImportableURI: {}\n'.format(
                    repr(importable_uri))
                errors += get_error_message(e)
            finally:
                if errors:
                    importable_uri.processing_error = errors
                    logger.error(errors)
                # Always mark the request as processed so that it is not
                # picked up again.
                importable_uri.processed_date = timezone.now()
                importable_uri.wip_date = None
                importable_uri.save()
                processed_counter += 1

        return processed_counter


def process_request(importable_uri):
    """
    Create or update Maven Packages for every artifact found under the
    version directories listed by the page at `importable_uri.uri`.

    The listing text is taken from `importable_uri.data` when present and is
    fetched from `importable_uri.uri` otherwise. The namespace and name come
    from `importable_uri.package_url` when it is set and are derived from the
    URI otherwise.

    Return an error message string when no listing text could be obtained,
    and None otherwise. Errors reported by merge_or_create_package are logged
    and saved on `importable_uri.processing_error`.
    """
    # Seconds to wait for the remote server before giving up on the request.
    request_timeout = 30

    uri = importable_uri.uri
    uri = uri.rstrip('/')
    data = importable_uri.data
    if not data:
        # collect data again if we don't have it
        response = requests.get(uri, timeout=request_timeout)
        # A requests Response is falsy when its status code is an error.
        if response:
            # The page text lives on the response object, not on the
            # `requests` module.
            data = response.text

    if not data:
        # Nothing to parse: report it to the caller instead of handing None
        # to the link collector.
        return f'Error: Could not collect data from {uri}\n'

    purl = importable_uri.package_url
    if purl:
        package_url = PackageURL.from_string(purl)
        namespace = package_url.namespace
        name = package_url.name
    else:
        namespace, name, _ = determine_namespace_name_version_from_url(uri)

    timestamps_by_directory_links = collect_links_from_text(data, filter_only_directories)
    # Go into each version directory
    for directory_link in timestamps_by_directory_links:
        version = directory_link.rstrip('/')
        version_page_url = f'{uri}/{version}'
        timestamps_by_artifact_links = get_artifact_links(version_page_url)
        for artifact_link, timestamp in timestamps_by_artifact_links.items():
            sha1 = get_artifact_sha1(artifact_link)
            classifier = get_classifier_from_artifact_url(artifact_link, version_page_url, name, version)
            qualifiers = None
            if classifier:
                qualifiers = f'classifier={classifier}'
            release_date = dateutil_parse(timestamp)
            package_data = PackageData(
                type='maven',
                namespace=namespace,
                name=name,
                version=version,
                qualifiers=qualifiers,
                download_url=artifact_link,
                sha1=sha1,
                release_date=release_date,
            )
            package, created, merged, map_error = merge_or_create_package(
                scanned_package=package_data,
                visit_level=50
            )
            if created:
                logger.info(f'Created package {package}')
            if merged:
                logger.info(f'Updated package {package}')
            if map_error:
                logger.error(f'Error encountered: {map_error}')
                importable_uri.processing_error = map_error
                importable_uri.save()
31 changes: 31 additions & 0 deletions minecode/management/commands/maven_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import logging
import sys

from minecode.management.commands import VerboseCommand
from minecode.visitors.maven import crawl_maven_repo_from_root


# Flip TRACE to True to get DEBUG-level output from this module's logger.
TRACE = False

# Log records go to stdout; this module's logger runs at INFO unless TRACE
# is enabled.
logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG if TRACE else logging.INFO)


class Command(VerboseCommand):
    """
    Management command that starts a crawl of the Maven repository.
    """
    # The previous help text described the request queue command, not this
    # crawler.
    help = 'Crawl the Maven repository starting from its root URL.'

    def handle(self, *args, **options):
        """
        Call crawl_maven_repo_from_root on the Maven repository root URL.
        """
        maven_root_url = 'https://repo.maven.apache.org/maven2'
        crawl_maven_repo_from_root(root_url=maven_root_url)
Loading
Loading