Create ProcessingError table
    * Append to ProcessingError table whenever we encounter a package we can't create or update in update_maven_package_data

Signed-off-by: Jono Yang <[email protected]>
JonoYang committed Nov 6, 2023
1 parent 25598a8 commit 8b1135e
Showing 3 changed files with 239 additions and 85 deletions.
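
Each of the bulk saves in update_maven_package_data now follows the same pattern: attempt the bulk operation inside a transaction and, if it fails with a DataError, fall back to saving each record individually so that the package causing the problem can be recorded in the new ProcessingError table instead of aborting the whole batch. The sketch below condenses that pattern into a single helper; the function name save_updated_packages and the simplified fallback (plain per-record saves, with no enclosing atomic block) are illustrative only, whereas the diff below inlines this logic at each call site.

# A condensed, illustrative sketch of the commit's error-handling pattern.
# The helper name and the simplified fallback are not part of the diff itself.
import logging
import traceback
from os.path import basename

from django.db import transaction
from django.db.utils import DataError
from django.utils import timezone

from minecode.models import ProcessingError
from packagedb.models import Package

logger = logging.getLogger(__name__)


def save_updated_packages(packages, fields):
    """
    Bulk-update `packages` and return the number of records actually saved.

    On a DataError, retry each record individually so the offending Package
    can be logged to the ProcessingError table rather than failing the batch.
    """
    try:
        with transaction.atomic():
            Package.objects.bulk_update(objs=packages, fields=fields)
        return len(packages)
    except DataError:
        saved_count = 0
        for package in packages:
            try:
                package.save()
                saved_count += 1
            except DataError:
                message = (
                    f'Error updating Package {package.package_uid}:\n\n'
                    f'{traceback.format_exc()}'
                )
                ProcessingError.objects.create(
                    service=basename(__file__),
                    date=timezone.now(),
                    error_message=message,
                )
                logger.error(message)
        return saved_count

The same fallback is applied to bulk_create for newly collected packages, with the error message keyed on unsaved_package.purl instead of package_uid.
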
2 changes: 1 addition & 1 deletion Makefile
@@ -85,7 +85,7 @@ postgres:
${SUDO_POSTGRES} dropdb packagedb || true
@echo "-> Create 'packagedb' database"
${SUDO_POSTGRES} createdb --encoding=utf-8 --owner=packagedb packagedb
@$(MAKE) migrate
# @$(MAKE) migrate

run:
${MANAGE} runserver 8001 --insecure
297 changes: 213 additions & 84 deletions minecode/management/commands/update_maven_package_data.py
@@ -7,11 +7,14 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#
from dateutil.parser import parse as dateutil_parse
from os.path import basename
import copy
import logging
import sys
import traceback

from django.db import transaction
from django.db.utils import DataError
from django.utils import timezone

from urllib3.util import Retry
@@ -21,6 +24,7 @@

from minecode.visitors.maven import collect_links_from_text
from minecode.visitors.maven import filter_for_artifacts
from minecode.models import ProcessingError
from packagedcode.maven import get_urls, build_filename
from minecode.management.commands import VerboseCommand
from packagedb.models import Package
@@ -98,54 +102,118 @@ def handle(self, *args, **options):
if not i % 2000:
updated = False
if unsaved_existing_packages:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages,
fields=[
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
updated_packages_count += len(unsaved_existing_packages)
unsaved_existing_packages = []
updated = True
try:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages,
fields=[
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
saved_packages_count = len(unsaved_existing_packages)
except DataError:
saved_packages_count = 0
with transaction.atomic():
# Update each record individually and then try to catch the package causing problems
for unsaved_package in unsaved_existing_packages:
try:
unsaved_package.save()
saved_packages_count += 1
except DataError:
service = basename(__file__)
traceback_message = traceback.format_exc()
message = f'Error updating Package {unsaved_package.package_uid}:\n\n{traceback_message}'
ProcessingError.objects.create(
service=service,
date=timezone.now(),
error_message=message,
)
logger.error(message)
finally:
updated_packages_count += saved_packages_count
unsaved_existing_packages = []
if saved_packages_count > 0:
updated = True

if unsaved_existing_packages_lowercased:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages_lowercased,
fields=[
'namespace',
'name',
'version',
'qualifiers',
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
updated_packages_count += len(unsaved_existing_packages_lowercased)
unsaved_existing_packages_lowercased = []
updated = True
try:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages_lowercased,
fields=[
'namespace',
'name',
'version',
'qualifiers',
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
saved_packages_count = len(unsaved_existing_packages_lowercased)
except DataError:
saved_packages_count = 0
with transaction.atomic():
# Update each record individually and then try to catch the package causing problems
for unsaved_package in unsaved_existing_packages_lowercased:
try:
unsaved_package.save()
saved_packages_count += 1
except DataError:
service = basename(__file__)
traceback_message = traceback.format_exc()
message = f'Error updating Package {unsaved_package.package_uid}:\n\n{traceback_message}'
ProcessingError.objects.create(
service=service,
date=timezone.now(),
error_message=message,
)
logger.error(message)
finally:
updated_packages_count += saved_packages_count
unsaved_existing_packages_lowercased = []
if saved_packages_count > 0:
updated = True

if updated:
logger.info(f'Updated {updated_packages_count:,} Maven Packages')

if unsaved_new_packages:
with transaction.atomic():
Package.objects.bulk_create(unsaved_new_packages)
created_packages_count += len(unsaved_new_packages)
unsaved_new_packages = []
logger.info(f'Created {created_packages_count:,} Maven Packages')
try:
with transaction.atomic():
Package.objects.bulk_create(unsaved_new_packages)
pc = len(unsaved_new_packages)
except DataError:
pc = 0
for unsaved_package in unsaved_new_packages:
try:
unsaved_package.save()
pc += 1
except DataError:
service = basename(__file__)
traceback_message = traceback.format_exc()
message = f'Error creating Package {unsaved_package.purl}:\n\n{traceback_message}'
ProcessingError.objects.create(
service=service,
date=timezone.now(),
error_message=message,
)
logger.error(message)
finally:
created_packages_count += pc
unsaved_new_packages = []
if pc > 0:
logger.info(f'Created {created_packages_count:,} Maven Packages')

logger.info(f'Deleted {deleted_packages_count:,} Duplicate Maven Packages')

@@ -293,49 +361,110 @@ def handle(self, *args, **options):
unsaved_new_packages.append(new_package)
logger.debug(f'Created Package {maven_package.purl}')

if unsaved_existing_packages:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages,
fields=[
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
updated_packages_count += len(unsaved_existing_packages)
unsaved_existing_packages = []

if unsaved_existing_packages_lowercased:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages_lowercased,
fields=[
'namespace',
'name',
'version',
'qualifiers',
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
updated_packages_count += len(unsaved_existing_packages_lowercased)
unsaved_existing_packages_lowercased = []

if unsaved_new_packages:
with transaction.atomic():
Package.objects.bulk_create(unsaved_new_packages)
created_packages_count += len(unsaved_new_packages)
unsaved_new_packages = []
if unsaved_existing_packages:
try:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages,
fields=[
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
saved_packages_count = len(unsaved_existing_packages)
except DataError:
saved_packages_count = 0
with transaction.atomic():
# Update each record individually and then try to catch the package causing problems
for unsaved_package in unsaved_existing_packages:
try:
unsaved_package.save()
saved_packages_count += 1
except DataError:
service = basename(__file__)
traceback_message = traceback.format_exc()
message = f'Error updating Package {unsaved_package.package_uid}:\n\n{traceback_message}'
ProcessingError.objects.create(
service=service,
date=timezone.now(),
error_message=message,
)
logger.error(message)
finally:
updated_packages_count += saved_packages_count
unsaved_existing_packages = []

if unsaved_existing_packages_lowercased:
try:
with transaction.atomic():
Package.objects.bulk_update(
objs=unsaved_existing_packages_lowercased,
fields=[
'namespace',
'name',
'version',
'qualifiers',
'download_url',
'repository_homepage_url',
'repository_download_url',
'api_data_url',
'release_date',
'last_modified_date',
'history',
]
)
saved_packages_count = len(unsaved_existing_packages_lowercased)
except DataError:
saved_packages_count = 0
with transaction.atomic():
# Update each record individually and then try to catch the package causing problems
for unsaved_package in unsaved_existing_packages_lowercased:
try:
unsaved_package.save()
saved_packages_count += 1
except DataError:
service = basename(__file__)
traceback_message = traceback.format_exc()
message = f'Error updating Package {unsaved_package.package_uid}:\n\n{traceback_message}'
ProcessingError.objects.create(
service=service,
date=timezone.now(),
error_message=message,
)
logger.error(message)
finally:
updated_packages_count += saved_packages_count
unsaved_existing_packages_lowercased = []

if unsaved_new_packages:
try:
with transaction.atomic():
Package.objects.bulk_create(unsaved_new_packages)
pc = len(unsaved_new_packages)
except DataError:
pc = 0
for unsaved_package in unsaved_new_packages:
try:
unsaved_package.save()
pc += 1
except DataError:
service = basename(__file__)
traceback_message = traceback.format_exc()
message = f'Error creating Package {unsaved_package.purl}:\n\n{traceback_message}'
ProcessingError.objects.create(
service=service,
date=timezone.now(),
error_message=message,
)
logger.error(message)
finally:
created_packages_count += pc
unsaved_new_packages = []

logger.info(f'Updated {updated_packages_count:,} Maven Packages')
logger.info(f'Created {created_packages_count:,} Maven Packages')
25 changes: 25 additions & 0 deletions minecode/models.py
@@ -1063,3 +1063,28 @@ def save(self, *args, **kwargs):
"""
self.normalize_fields()
super(ImportableURI, self).save(*args, **kwargs)


class ProcessingError(BaseURI):
service = models.CharField(
max_length=100,
null=True,
blank=True,
help_text='The name of the service running when the error occurred.'
)

date = models.DateTimeField(
null=True,
blank=True,
db_index=True,
help_text='Timestamp set to the date when this error occurred.',
)

error_message = models.TextField(
null=True,
blank=True,
help_text='The message associated with this error.'
)

class Meta:
verbose_name = 'Processing Error'
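
For reference, here is a minimal usage sketch for the new model: the first part mirrors how update_maven_package_data records an error, and the second shows one way the recorded rows could later be reviewed. The service filter value and the placeholder error message are illustrative only.

# Illustrative only: recording an error the way the management command does ...
from os.path import basename

from django.utils import timezone

from minecode.models import ProcessingError

ProcessingError.objects.create(
    service=basename(__file__),  # e.g. 'update_maven_package_data.py'
    date=timezone.now(),
    error_message='Error updating Package <package_uid>:\n\n<traceback>',
)

# ... and reviewing the recorded errors for a given service, newest first.
for error in ProcessingError.objects.filter(
    service='update_maven_package_data.py',
).order_by('-date'):
    print(error.date, error.error_message)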
