From 61a111eb2424f79dcf564413748a638664f8e14b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 21 Oct 2024 09:23:52 -0400 Subject: [PATCH] (wip) osf:storageByteCount supplementary metadata [ENG-6187] --- api/caching/tasks.py | 67 +++++++++++++++++++++++------------ osf/metadata/osf_gathering.py | 16 +++++++++ 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/api/caching/tasks.py b/api/caching/tasks.py index 0b7a4b6670f..dba6785d5db 100644 --- a/api/caching/tasks.py +++ b/api/caching/tasks.py @@ -1,11 +1,12 @@ +import logging from urllib.parse import urlparse + +from django.apps import apps +from django.contrib.contenttypes.models import ContentType from django.db import connection from django.db.models import Sum - import requests -import logging -from django.apps import apps from api.caching.utils import storage_usage_cache from framework.postcommit_tasks.handlers import enqueue_postcommit_task @@ -114,32 +115,54 @@ def ban_url(instance): def update_storage_usage_cache(target_id, target_guid, per_page=500000): if not settings.ENABLE_STORAGE_USAGE_CACHE: return - sql = """ - SELECT count(size), sum(size) from - (SELECT size FROM osf_basefileversionsthrough AS obfnv - LEFT JOIN osf_basefilenode file ON obfnv.basefilenode_id = file.id - LEFT JOIN osf_fileversion version ON obfnv.fileversion_id = version.id - LEFT JOIN django_content_type type on file.target_content_type_id = type.id + from osf.models import Guid + storage_usage_total = compute_storage_usage_total(Guid.load(target_guid).referent) + key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) + storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + + +def compute_storage_usage_total(target_obj, per_page=500000): + sql = """SELECT count(size), sum(size) from ( + SELECT version.size AS size + FROM osf_basefileversionsthrough AS obfnv + LEFT JOIN osf_basefilenode AS file ON obfnv.basefilenode_id = file.id + LEFT JOIN osf_fileversion AS version ON 
obfnv.fileversion_id = version.id WHERE file.provider = 'osfstorage' - AND type.model = 'abstractnode' AND file.deleted_on IS NULL - AND file.target_object_id=%s + AND file.target_object_id=%(target_id)s + AND file.target_content_type_id = %(target_content_type_id)s ORDER BY version.id - LIMIT %s OFFSET %s) file_page + LIMIT %(per_page)s OFFSET %(offset)s + ) """ - count = per_page + last_count = 1 # initialize non-zero offset = 0 storage_usage_total = 0 with connection.cursor() as cursor: - while count: - cursor.execute(sql, [target_id, per_page, offset]) - result = cursor.fetchall() - storage_usage_total += int(result[0][1]) if result[0][1] else 0 - count = int(result[0][0]) if result[0][0] else 0 - offset += count - - key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid) - storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + while last_count: + cursor.execute(sql, { + 'target_id': target_obj.pk, + 'target_content_type_id': ContentType.objects.get_for_model(target_obj).pk, + 'per_page': per_page, + 'offset': offset, + }) + page_count, size_sum = cursor.fetchall()[0] + storage_usage_total += (size_sum or 0) + last_count = (page_count or 0) + offset += last_count + return storage_usage_total + + +def get_storage_usage_total(target_obj): + _storage_usage_total = None + if settings.ENABLE_STORAGE_USAGE_CACHE: + _cache_key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_obj._id) + _storage_usage_total = storage_usage_cache.get(_cache_key) + if _storage_usage_total is None: + _storage_usage_total = compute_storage_usage_total(target_obj) + if settings.ENABLE_STORAGE_USAGE_CACHE: + storage_usage_cache.set(_cache_key, _storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT) + return _storage_usage_total def update_storage_usage(target): diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index 17fd4affa2c..afbb8dc198c 100644 --- a/osf/metadata/osf_gathering.py +++ 
b/osf/metadata/osf_gathering.py @@ -8,6 +8,7 @@ from django import db import rdflib +from api.caching.tasks import get_storage_usage_total from osf import models as osfdb from osf.metadata import gather from osf.metadata.rdfutils import ( @@ -212,19 +213,24 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket: OSFMAP_SUPPLEMENT = { OSF.Project: { OSF.hasOsfAddon: None, + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.ProjectComponent: { OSF.hasOsfAddon: None, + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.Registration: { + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.RegistrationComponent: { + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.Preprint: { + OSF.storageByteCount: None, OSF.storageRegion: None, }, OSF.File: { @@ -1149,3 +1155,13 @@ def gather_storage_region(focus): yield (OSF.storageRegion, _region_ref) yield (_region_ref, RDF.type, OSF.Region) yield (_region_ref, SKOS.prefLabel, rdflib.Literal(_region.name, lang='en')) + + +@gather.er( + OSF.storageByteCount, + focustype_iris=[OSF.Project, OSF.ProjectComponent, OSF.Registration, OSF.RegistrationComponent, OSF.Preprint] +) +def gather_storage_byte_count(focus): + _storage_usage_total = get_storage_usage_total(focus.dbmodel) + if _storage_usage_total is not None: + yield (OSF.storageByteCount, _storage_usage_total)