From 41191937feb7fdc61406cd35b349181cd849fb8d Mon Sep 17 00:00:00 2001
From: abram axel booth
Date: Thu, 19 Sep 2024 12:07:36 -0400
Subject: [PATCH] PublicItemUsageReport(er) + tests

---
 osf/metrics/reporters/public_item_usage.py    | 106 +++++++++
 osf/metrics/reports.py                        |  23 ++
 .../test_public_item_usage_reporter.py        | 203 ++++++++++++++++++
 3 files changed, 332 insertions(+)
 create mode 100644 osf/metrics/reporters/public_item_usage.py
 create mode 100644 osf_tests/metrics/reporters/test_public_item_usage_reporter.py

diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py
new file mode 100644
index 00000000000..12282a27306
--- /dev/null
+++ b/osf/metrics/reporters/public_item_usage.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+import typing
+
+if typing.TYPE_CHECKING:
+    import elasticsearch_dsl
+
+from osf.metrics.counted_usage import CountedAuthUsage
+from osf.metrics.reports import PublicItemUsageReport
+from osf.metrics.utils import YearMonth
+from ._base import MonthlyReporter
+
+
+_CHUNK_SIZE = 500
+
+
+class PublicItemUsageReporter(MonthlyReporter):
+    '''build a PublicItemUsageReport for each public item with usage in the given month
+
+    includes projects, project components, registrations, registration components, and preprints
+    '''
+
+    def report(self, yearmonth: YearMonth):
+        for _itembucket in self._iter_itembuckets(yearmonth):
+            yield self._report_from_itembucket(_itembucket)
+
+    def _item_page_search(self, yearmonth) -> elasticsearch_dsl.Search:
+        _usage_search = (
+            CountedAuthUsage.search()
+            .filter('term', item_public=True)
+            .filter('range', timestamp={
+                'gte': yearmonth.target_month(),
+                'lt': yearmonth.next_month(),
+            })
+            .update_from_dict({'size': 0})  # only aggregations, no hits
+        )
+        # the main agg: use a composite aggregation to page thru *every* item (_CHUNK_SIZE per page)
+        _agg_items = _usage_search.aggs.bucket(
+            'agg_items',
+            'composite',
+            sources=[{'item_osfid': {'terms': {'field': 'item_guid'}}}],
+            size=_CHUNK_SIZE,
+        )
+        # nested agg: for each item, get platform_iri values
+        _agg_items.bucket('agg_platform_iri', 'terms', field='platform_iri')
+        # nested agg: for each item, get provider_id values
+        _agg_items.bucket('agg_provider_id', 'terms', field='provider_id')
+        # nested agg: for each item, get item_type values
+        _agg_items.bucket('agg_item_type', 'terms', field='item_type')
+        # nested agg: for each item, get view and download counts (one bucket per action label)
+        _agg_action = _agg_items.bucket(
+            'agg_action',
+            'terms',
+            field='action_labels',
+            include=[
+                CountedAuthUsage.ActionLabel.VIEW.value,
+                CountedAuthUsage.ActionLabel.DOWNLOAD.value,
+            ],
+        )
+        # nested nested agg: for each item-action pair, get a session count
+        _agg_action.bucket(
+            'agg_session_count',
+            'cardinality',
+            field='session_id',
+            precision_threshold=40000,  # maximum precision
+        )
+        return _usage_search
+
+    def _iter_itembuckets(self, yearmonth: YearMonth):
+        _search = self._item_page_search(yearmonth)
+        while _search is not None:
+            _page_response = _search.execute()
+            _agg_items = _page_response.aggregations.agg_items
+            yield from _agg_items.buckets
+            # a full page may not be the last: resume after its after_key; a short page ends the loop
+            if len(_agg_items.buckets) == _CHUNK_SIZE:
+                _search.aggs['agg_items'].after = _agg_items.after_key
+            else:
+                _search = None
+
+    def _report_from_itembucket(self, itembucket):
+        _report = PublicItemUsageReport(
+            item_osfid=itembucket.key.item_osfid,
+            item_type=_agg_keys(itembucket.agg_item_type),
+            provider_id=_agg_keys(itembucket.agg_provider_id),
+            platform_iri=_agg_keys(itembucket.agg_platform_iri),
+            # default counts to zero, will be updated if non-zero
+            view_count=0,
+            view_session_count=0,
+            download_count=0,
+            download_session_count=0,
+        )
+        for _actionbucket in itembucket.agg_action:
+            if _actionbucket.key == CountedAuthUsage.ActionLabel.VIEW.value:
+                _report.view_count = _actionbucket.doc_count
+                _report.view_session_count = _actionbucket.agg_session_count.value
+            elif _actionbucket.key == CountedAuthUsage.ActionLabel.DOWNLOAD.value:
+                _report.download_count = _actionbucket.doc_count
+                _report.download_session_count = _actionbucket.agg_session_count.value
+        return _report
+
+
+###
+# local helpers
+
+def _agg_keys(bucket_agg_result) -> list:
+    return [_bucket.key for _bucket in bucket_agg_result]
diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py
index cee4efc7c02..612544c3ba8 100644
--- a/osf/metrics/reports.py
+++ b/osf/metrics/reports.py
@@ -270,3 +270,26 @@ class InstitutionalUserReport(MonthlyReport):
     published_preprint_count = metrics.Integer()
     public_file_count = metrics.Long()
     storage_byte_count = metrics.Long()
+
+
+class PublicItemUsageReport(MonthlyReport):
+    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid')
+
+    # where noted, fields correspond to defined terms from COUNTER
+    # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+    item_osfid = metrics.Keyword()  # counter:Item
+    item_type = metrics.Keyword(multi=True)  # counter:Data-Type
+    provider_id = metrics.Keyword(multi=True)  # counter:Database(?)
+    platform_iri = metrics.Keyword(multi=True)  # counter:Platform
+
+    # counts for this item only (not including components or files)
+    view_count = metrics.Long()  # counter:Total_Item_Investigations
+    view_session_count = metrics.Long()  # counter:Unique_Item_Investigations
+    download_count = metrics.Long()  # counter:Total_Item_Requests
+    download_session_count = metrics.Long()  # counter:Unique_Item_Requests
+
+    # combined with counts for contained components and files
+    combined_view_count = metrics.Long()
+    combined_view_session_count = metrics.Long()
+    combined_download_count = metrics.Long()
+    combined_download_session_count = metrics.Long()
diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
new file mode 100644
index 00000000000..84f1a91f60b
--- /dev/null
+++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
@@ -0,0 +1,203 @@
+from datetime import timedelta
+from operator import attrgetter
+from unittest import mock
+
+import pytest
+
+from osf.metrics.counted_usage import CountedAuthUsage
+from osf.metrics.reporters.public_item_usage import PublicItemUsageReporter
+from osf.metrics.reports import PublicItemUsageReport
+from osf.metrics.utils import YearMonth
+
+
+@pytest.mark.es_metrics
+class TestPublicItemUsageReport:
+    @pytest.fixture(autouse=True)
+    def _mocks(self):
+        # HACK: skip auto-filling fields from the database
+        with mock.patch('osf.metrics.counted_usage.Guid.load', return_value=None):
+            yield
+
+    @pytest.fixture
+    def ym_empty(self) -> YearMonth:
+        return YearMonth(2012, 7)
+
+    @pytest.fixture
+    def ym_sparse(self) -> YearMonth:
+        return YearMonth(2017, 7)
+
+    @pytest.fixture
+    def ym_busy(self) -> YearMonth:
+        return YearMonth(2023, 7)
+
+    @pytest.fixture
+    def sparse_month_usage(self, ym_sparse):
+        # "sparse" month:
+        # item0: 3 views, 0 downloads, 2 sessions
+        # item1: 1 view, 1 download, 1 session
+        _month_start = ym_sparse.target_month()
+        _save_usage(
+            timestamp=_month_start,
+            item_guid='item0',
+            session_id='sesh0',
+            action_labels=['view'],
+        )
+        _save_usage(
+            timestamp=_month_start,
+            item_guid='item1',
+            session_id='sesh0',
+            action_labels=['view'],
+        )
+        _save_usage(
+            timestamp=_month_start + timedelta(minutes=2),
+            item_guid='item0',
+            session_id='sesh0',
+            action_labels=['view'],
+        )
+        _save_usage(
+            timestamp=_month_start + timedelta(minutes=3),
+            item_guid='item1',
+            session_id='sesh0',
+            action_labels=['download'],
+        )
+        _save_usage(
+            timestamp=_month_start + timedelta(days=17),
+            item_guid='item0',
+            session_id='sesh1',
+            action_labels=['view'],
+        )
+
+    @pytest.fixture
+    def busy_month_item0(self, ym_busy):
+        # item0: 4 sessions, 4*7 views, 4*5 downloads
+        _month_start = ym_busy.target_month()
+        for _sesh in range(0, 4):
+            _sesh_start = _month_start + timedelta(days=_sesh)
+            for _minute in range(0, 7):
+                _save_usage(
+                    timestamp=_sesh_start + timedelta(minutes=_minute),
+                    item_guid='item0',
+                    session_id=f'sesh0{_sesh}',
+                    action_labels=['view'],
+                )
+            for _minute in range(10, 15):
+                _save_usage(
+                    timestamp=_sesh_start + timedelta(minutes=_minute),
+                    item_guid='item0',
+                    session_id=f'sesh0{_sesh}',
+                    action_labels=['download'],
+                )
+
+    @pytest.fixture
+    def busy_month_item1(self, ym_busy):
+        # item1: 10 sessions (one both views and downloads), 6*9 views, 5*7 downloads, 2 providers
+        _month_start = ym_busy.target_month()
+        for _sesh in range(0, 6):
+            _sesh_start = _month_start + timedelta(days=_sesh)
+            for _minute in range(0, 9):
+                _save_usage(
+                    timestamp=_sesh_start + timedelta(minutes=_minute),
+                    item_guid='item1',
+                    session_id=f'sesh1{_sesh}',
+                    action_labels=['view'],
+                )
+        for _sesh in range(5, 10):
+            _sesh_start = _month_start + timedelta(days=_sesh)
+            for _minute in range(10, 17):
+                _save_usage(
+                    timestamp=_sesh_start + timedelta(minutes=_minute),
+                    item_guid='item1',
+                    session_id=f'sesh1{_sesh}',
+                    action_labels=['download'],
+                    provider_id='prov1',  # additional provider_id
+                )
+
+    @pytest.fixture
+    def busy_month_item2(self, ym_busy):
+        # item2: 11 sessions, 11 views, 11 downloads
+        _month_start = ym_busy.target_month()
+        for _sesh in range(1, 12):
+            _save_usage(
+                timestamp=_month_start + timedelta(days=_sesh),
+                item_guid='item2',
+                session_id=f'sesh2{_sesh}',
+                action_labels=['view'],
+            )
+            _save_usage(
+                timestamp=_month_start + timedelta(days=_sesh, hours=_sesh),
+                item_guid='item2',
+                session_id=f'sesh2{_sesh}',
+                action_labels=['download'],
+            )
+
+    def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2):
+        _reporter = PublicItemUsageReporter()
+        _empty = list(_reporter.report(ym_empty))
+        _sparse = list(_reporter.report(ym_sparse))
+        _busy = list(_reporter.report(ym_busy))
+
+        # empty month:
+        assert _empty == []
+
+        # sparse month:
+        assert len(_sparse) == 2
+        _sparse_item0, _sparse_item1 = sorted(_sparse, key=attrgetter('item_osfid'))
+        # sparse-month item0
+        assert isinstance(_sparse_item0, PublicItemUsageReport)
+        assert _sparse_item0.item_osfid == 'item0'
+        assert _sparse_item0.provider_id == ['prov0']
+        assert _sparse_item0.platform_iri == ['http://osf.example']
+        assert _sparse_item0.view_count == 3
+        assert _sparse_item0.view_session_count == 2
+        assert _sparse_item0.download_count == 0
+        assert _sparse_item0.download_session_count == 0
+        # sparse-month item1
+        assert isinstance(_sparse_item1, PublicItemUsageReport)
+        assert _sparse_item1.item_osfid == 'item1'
+        assert _sparse_item1.provider_id == ['prov0']
+        assert _sparse_item1.platform_iri == ['http://osf.example']
+        assert _sparse_item1.view_count == 1
+        assert _sparse_item1.view_session_count == 1
+        assert _sparse_item1.download_count == 1
+        assert _sparse_item1.download_session_count == 1
+
+        # busy month:
+        assert len(_busy) == 3
+        _busy_item0, _busy_item1, _busy_item2 = sorted(_busy, key=attrgetter('item_osfid'))
+        # busy-month item0
+        assert isinstance(_busy_item0, PublicItemUsageReport)
+        assert _busy_item0.item_osfid == 'item0'
+        assert _busy_item0.provider_id == ['prov0']
+        assert _busy_item0.platform_iri == ['http://osf.example']
+        assert _busy_item0.view_count == 4 * 7
+        assert _busy_item0.view_session_count == 4
+        assert _busy_item0.download_count == 4 * 5
+        assert _busy_item0.download_session_count == 4
+        # busy-month item1
+        assert isinstance(_busy_item1, PublicItemUsageReport)
+        assert _busy_item1.item_osfid == 'item1'
+        assert _busy_item1.provider_id == ['prov0', 'prov1']
+        assert _busy_item1.platform_iri == ['http://osf.example']
+        assert _busy_item1.view_count == 6 * 9
+        assert _busy_item1.view_session_count == 6
+        assert _busy_item1.download_count == 5 * 7
+        assert _busy_item1.download_session_count == 5
+        # busy-month item2
+        assert isinstance(_busy_item2, PublicItemUsageReport)
+        assert _busy_item2.item_osfid == 'item2'
+        assert _busy_item2.provider_id == ['prov0']
+        assert _busy_item2.platform_iri == ['http://osf.example']
+        assert _busy_item2.view_count == 11
+        assert _busy_item2.view_session_count == 11
+        assert _busy_item2.download_count == 11
+        assert _busy_item2.download_session_count == 11
+
+
+def _save_usage(**kwargs):
+    _kwargs = {  # overridable defaults:
+        'platform_iri': 'http://osf.example',
+        'item_public': True,
+        'provider_id': 'prov0',
+        **kwargs,
+    }
+    CountedAuthUsage(**_kwargs).save(refresh=True)