Skip to content

Commit

Permalink
Fix issues found in packagedb migrations
Browse files Browse the repository at this point in the history
Signed-off-by: Jono Yang <[email protected]>
  • Loading branch information
JonoYang committed Oct 9, 2023
1 parent c55f7d1 commit 3e1bdb3
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 18 deletions.
28 changes: 22 additions & 6 deletions packagedb/migrations/0047_add_search_vector_field_to_package.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Generated by Django 3.1.5 on 2021-03-10 19:04

import django.contrib.postgres.search
from django.contrib.postgres.search import SearchVector, SearchVectorField
from django.db import migrations


Expand All @@ -9,10 +9,26 @@ def populate_search_vector_field(apps, schema_editor):
Data migration used to populate the `search_vector` field of any Package rows that currently exist.
"""
Package = apps.get_model('packagedb', 'Package')

for pkg in Package.objects.iterator():
pkg.search_vector = search.SearchVector('namespace', 'name', 'version', 'download_url')
pkg.save()
resource_uris = Package.objects.iterator(chunk_size=5000)
updated = []
for i, package in enumerate(resource_uris):
if not i % 5000:
Package.objects.bulk_update(
objs=updated,
fields=[
'search_vector',
]
)
updated = []
package.search_vector = SearchVector('namespace', 'name', 'version', 'download_url')
updated.append(package)
if updated:
Package.objects.bulk_update(
objs=updated,
fields=[
'search_vector',
]
)


class Migration(migrations.Migration):
Expand All @@ -25,7 +41,7 @@ class Migration(migrations.Migration):
migrations.AddField(
model_name='package',
name='search_vector',
field=django.contrib.postgres.search.SearchVectorField(null=True),
field=SearchVectorField(null=True),
),
migrations.RunPython(populate_search_vector_field),
]
43 changes: 38 additions & 5 deletions packagedb/migrations/0059_compute_package_license_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,51 @@ def compute_package_declared_license_expression_spdx(apps, schema_editor):
Compute Package `declared_license_expression_spdx`, when missing,
from `declared_license_expression`, when available.
"""
from licensedcode.cache import build_spdx_license_expression
from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError
from packageurl import PackageURL

Package = apps.get_model('packagedb', 'Package')
packages = Package.objects.filter(
~Q(declared_license_expression="") & Q(declared_license_expression_spdx="") |
Q(declared_license_expression__isnull=False) & Q(declared_license_expression_spdx__isnull=True)
)
package_count = packages.count()
chunk_size = 2000
iterator = packages.iterator(chunk_size=chunk_size)
updated = []
for i, package in enumerate(iterator):
if (not i % chunk_size) and updated:
Package.objects.bulk_update(
objs=updated,
fields=[
'declared_license_expression_spdx',
]
)
updated = []
print(f" {i:,} / {package_count:,} computed and updated")
try:
if spdx := build_spdx_license_expression(package.declared_license_expression):
package.declared_license_expression_spdx = spdx
updated.append(package)
except InvalidLicenseKeyError as e:
package_url = PackageURL(
type=package.type,
namespace=package.namespace,
name=package.name,
version=package.version,
qualifiers=package.qualifiers,
subpath=package.subpath
)
print(f" Error processing {package_url}: {e}")

for package in packages:
if spdx := build_spdx_license_expression(package.declared_license_expression):
package.declared_license_expression_spdx = spdx
package.save()
if updated:
print("Updating remaining Packages...")
Package.objects.bulk_update(
objs=updated,
fields=[
'declared_license_expression_spdx',
]
)


class Migration(migrations.Migration):
Expand Down
12 changes: 8 additions & 4 deletions packagedb/migrations/0062_compute_resource_license_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ def compute_resource_detected_license_expression(apps, schema_editor):
From scancode.io
"""
from license_expression import combine_expressions
from licensedcode.cache import build_spdx_license_expression
from licensedcode.cache import build_spdx_license_expression, InvalidLicenseKeyError

if settings.IS_TESTS:
return

Resource = apps.get_model("packagedb", "Resource")
resources = Resource.objects.filter(~Q(license_expressions=[]) | Q(license_expressions__isnull=False)).only('license_expressions')
# Keep only resources whose license_expressions is neither an empty list nor NULL.
# Django's null-test lookup is spelled `isnull`; `is_null` is not a built-in lookup.
resources = Resource.objects.filter(~Q(license_expressions=[])).filter(license_expressions__isnull=False)

object_count = resources.count()
print(f"\nCompute detected_license_expression for {object_count:,} resources.")
Expand All @@ -29,7 +29,11 @@ def compute_resource_detected_license_expression(apps, schema_editor):

unsaved_objects = []
for index, resource in enumerate(iterator, start=1):
combined_expression = str(combine_expressions(resource.license_expressions))
combined_expression = combine_expressions(resource.license_expressions)
if not combined_expression:
print(f' invalid license expression for {resource.path}: {combined_expression}')
continue
combined_expression = str(combined_expression)
# gpl-2.0 OR broadcom-linking-unmodified OR proprietary-license
# build_spdx_license_expression("broadcom-linking-unmodified")
# AttributeError: 'LicenseSymbol' object has no attribute 'wrapped'
Expand Down Expand Up @@ -122,7 +126,7 @@ def compute_resource_license_detections(apps, schema_editor):
From scancode.io
"""
Resource = apps.get_model("packagedb", "Resource")
resources = Resource.objects.filter(~Q(licenses=[]) | Q(licenses__isnull=False)).only('licenses')
# Keep only resources whose licenses value is neither an empty list nor NULL.
# Django's null-test lookup is spelled `isnull`; `is_null` is not a built-in lookup.
resources = Resource.objects.filter(~Q(licenses=[])).filter(licenses__isnull=False)

object_count = resources.count()
print(f"\nCompute license_detections for {object_count:,} resources.")
Expand Down
19 changes: 16 additions & 3 deletions packagedb/migrations/0070_auto_20230706_0045.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,29 @@ def create_maven_package_sets(apps, schema_editor):
"version",
"qualifiers",
"subpath",
).iterator(
chunk_size=5000
)
package_count = maven_packages_without_package_set.count()
chunk_size = 2000
iterator = maven_packages_without_package_set.iterator(
chunk_size=chunk_size
)

prev_namespace = None
prev_name = None
prev_version = None
prev_package = None
unupdated_packages = []
for package in maven_packages_without_package_set:
for i, package in enumerate(iterator):
if not (i % chunk_size) and unupdated_packages:
Package.objects.bulk_update(
objs=unupdated_packages,
fields=[
"package_content",
]
)
unupdated_packages = []
print(f" {i:,} / {package_count:,} updated")

if "source" in package.qualifiers:
package_content = PackageContentType.SOURCE_ARCHIVE
else:
Expand Down

0 comments on commit 3e1bdb3

Please sign in to comment.