generated from aboutcode-org/skeleton
-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create command to populate Package.search_vector
Signed-off-by: Jono Yang <[email protected]>
- Loading branch information
Showing
1 changed file
with
82 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
# | ||
# Copyright (c) nexB Inc. and others. All rights reserved. | ||
# purldb is a trademark of nexB Inc. | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||
# See https://github.com/nexB/purldb for support or download. | ||
# See https://aboutcode.org for more information about nexB OSS projects. | ||
# | ||
|
||
import copy | ||
import logging | ||
import sys | ||
|
||
from django.contrib.postgres.search import SearchVector | ||
from django.db import transaction | ||
|
||
from minecode.management.commands import VerboseCommand | ||
from packagedb.models import Package | ||
|
||
# Tracing switch; nothing in the visible part of this module reads it.
TRACE = False


# Module-level logger. basicConfig() routes root-logger output to stdout, and
# the INFO level set below means logger.debug() calls in this module are
# dropped unless the level is lowered.
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.INFO)
|
||
|
||
# This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179 | ||
class MemorySavingQuerysetIterator:
    """
    Iterate over a queryset in fixed-size slices instead of evaluating it in
    one go.

    ``queryset`` must support ``count()``, slicing and ``iterator()``.
    ``max_obj_num`` is the number of rows requested per slice.

    NOTE: the row count is taken once, when iteration starts, and every slice
    is a separate offset-based query (``queryset[start:start + max_obj_num]``).
    If rows enter or leave ``queryset`` while it is being iterated (for
    example because the caller updates the very field the queryset filters
    on), later slices will skip or repeat rows. Do not use this class for
    such workloads.
    """

    def __init__(self, queryset, max_obj_num=1000):
        self._base_queryset = queryset
        self.max_obj_num = max_obj_num
        # _setup() is a generator function: no query runs until the first
        # item is requested.
        self._generator = self._setup()

    def _setup(self):
        """
        Yield every object of the base queryset, one slice at a time.
        """
        for start in range(0, self._base_queryset.count(), self.max_obj_num):
            # Slice a copy of the queryset rather than the base queryset
            # itself, so that each query only asks for `max_obj_num` objects.
            smaller_queryset = copy.deepcopy(self._base_queryset)[start:start + self.max_obj_num]
            logger.debug('Grabbing next %s objects from DB', self.max_obj_num)
            yield from smaller_queryset.iterator()

    def __iter__(self):
        return self._generator

    def __next__(self):
        # Python 3 iterator protocol: generators expose __next__(), not
        # next(), so delegate through the builtin.
        return next(self._generator)

    # Python 2 style spelling, kept so that explicit `.next()` calls work.
    next = __next__
|
||
|
||
class Command(VerboseCommand):
    """
    Populate `Package.search_vector` for every Package of the 'default'
    database whose `search_vector` is still NULL.
    """

    def handle(self, *args, **options):
        """
        Fill in `search_vector` in batches and print progress to stdout.

        Batches are selected by primary key (keyset pagination) rather than by
        offset. Each update removes rows from the `search_vector IS NULL`
        filter, so offset-based slicing of that filter would jump over rows
        that have not been processed yet.
        """
        batch_size = 2000
        packages_without_search_vectors = Package.objects.using('default').filter(search_vector__isnull=True)
        packages_without_search_vectors_count = packages_without_search_vectors.count()
        print(f"Populating the `search_vector` field for {packages_without_search_vectors_count:,} Packages from the 'default` database")

        search_vector = SearchVector('namespace', 'name', 'version', 'download_url')
        pending_by_pk = packages_without_search_vectors.order_by('pk')
        populated = 0
        last_pk = None
        while True:
            batch = pending_by_pk
            if last_pk is not None:
                # Always move forward, so the loop terminates even if a row
                # were to remain NULL after being updated.
                batch = batch.filter(pk__gt=last_pk)
            pks = list(batch.values_list('pk', flat=True)[:batch_size])
            if not pks:
                break

            # One UPDATE statement per batch: the vector is computed by the
            # database, no model instance is loaded, and a single statement
            # needs no explicit transaction.
            populated += (
                Package.objects.using('default')
                .filter(pk__in=pks)
                .update(search_vector=search_vector)
            )
            last_pk = pks[-1]
            print(f" {populated:,} / {packages_without_search_vectors_count:,} Package `search_vector`s populated")

        print(f"{populated:,} Package `search_vector`s populated")