Skip to content

Commit

Permalink
PublicItemUsageReport
Browse files Browse the repository at this point in the history
  • Loading branch information
aaxelb committed Sep 20, 2024
1 parent 884bdfc commit e76c74a
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 0 deletions.
106 changes: 106 additions & 0 deletions osf/metrics/reporters/public_item_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import annotations
import typing

if typing.TYPE_CHECKING:
import elasticsearch_dsl

from osf.metrics.counted_usage import CountedAuthUsage
from osf.metrics.reports import PublicItemUsageReport
from osf.metrics.utils import YearMonth
from ._base import MonthlyReporter


# number of item buckets requested per page of the composite aggregation;
# a page with exactly this many buckets is taken to mean more pages may follow
_CHUNK_SIZE = 500


class PublicItemUsageReporter(MonthlyReporter):
    '''build a PublicItemUsageReport for each public item

    includes projects, project components, registrations, registration components, and preprints
    '''

    def report(self, yearmonth: YearMonth):
        '''yield one PublicItemUsageReport per item bucket found for the given month

        lazy: pages of search results are fetched only as the generator is consumed
        '''
        for _itembucket in self._iter_itembuckets(yearmonth):
            yield self._report_from_itembucket(_itembucket)

    def _item_page_search(self, yearmonth) -> elasticsearch_dsl.Search:
        '''build the aggregation-only search for one page of per-item usage

        matches CountedAuthUsage records with `item_public` true and a timestamp
        within [yearmonth.target_month(), yearmonth.next_month()); the returned
        search has no `after` set, so it requests the first page
        '''
        _usage_search = (
            CountedAuthUsage.search()
            .filter('term', item_public=True)
            .filter('range', timestamp={
                'gte': yearmonth.target_month(),
                'lt': yearmonth.next_month(),
            })
            .update_from_dict({'size': 0})  # only aggregations, no hits
        )
        # the main agg: use a composite aggregation to page thru *every* item
        _agg_items = _usage_search.aggs.bucket(
            'agg_items',
            'composite',
            sources=[{'item_osfid': {'terms': {'field': 'item_guid'}}}],
            size=_CHUNK_SIZE,
        )
        # nested agg: for each item, get platform_iri values
        _agg_items.bucket('agg_platform_iri', 'terms', field='platform_iri')
        # nested agg: for each item, get provider_id values
        _agg_items.bucket('agg_provider_id', 'terms', field='provider_id')
        # nested agg: for each item, get item_type values
        _agg_items.bucket('agg_item_type', 'terms', field='item_type')
        # nested agg: for each item, get view and download count
        _agg_action = _agg_items.bucket(
            'agg_action',
            'terms',
            field='action_labels',
            include=[
                CountedAuthUsage.ActionLabel.VIEW.value,
                CountedAuthUsage.ActionLabel.DOWNLOAD.value,
            ],
        )
        # nested nested agg: for each item-action pair, get a session count
        _agg_action.bucket(
            'agg_session_count',
            'cardinality',
            field='session_id',
            precision_threshold=40000,  # maximum precision
        )
        return _usage_search

    def _iter_itembuckets(self, yearmonth: YearMonth):
        '''yield every `agg_items` bucket for the given month, one page at a time

        pages thru the composite aggregation by feeding each page's `after_key`
        back into the same search object until a page comes back short
        '''
        _search = self._item_page_search(yearmonth)
        while _search is not None:
            # the same search object is reused (and mutated) for every page, and
            # `execute` caches its response on that object by default -- bypass
            # the cache, or every call after the first would return the first
            # page again and this loop would never end
            _page_response = _search.execute(ignore_cache=True)
            _agg_items = _page_response.aggregations.agg_items
            yield from _agg_items.buckets
            # update the search for the next page
            if len(_agg_items.buckets) == _CHUNK_SIZE:
                # a full page means there may be more: resume after its last key
                _search.aggs['agg_items'].after = _agg_items.after_key
            else:
                # a short (or empty) page is the last one
                _search = None

    def _report_from_itembucket(self, itembucket):
        '''convert one `agg_items` bucket into a PublicItemUsageReport

        view/download counts come from the `agg_action` sub-buckets (doc_count
        for totals, `agg_session_count` cardinality for session counts); an
        action with no sub-bucket keeps its zero default
        '''
        _report = PublicItemUsageReport(
            item_osfid=itembucket.key.item_osfid,
            item_type=_agg_keys(itembucket.agg_item_type),
            provider_id=_agg_keys(itembucket.agg_provider_id),
            platform_iri=_agg_keys(itembucket.agg_platform_iri),
            # default counts to zero, will be updated if non-zero
            view_count=0,
            view_session_count=0,
            download_count=0,
            download_session_count=0,
        )
        for _actionbucket in itembucket.agg_action:
            if _actionbucket.key == CountedAuthUsage.ActionLabel.VIEW.value:
                _report.view_count = _actionbucket.doc_count
                _report.view_session_count = _actionbucket.agg_session_count.value
            elif _actionbucket.key == CountedAuthUsage.ActionLabel.DOWNLOAD.value:
                _report.download_count = _actionbucket.doc_count
                _report.download_session_count = _actionbucket.agg_session_count.value
        return _report


###
# local helpers

def _agg_keys(bucket_agg_result) -> list:
return [_bucket.key for _bucket in bucket_agg_result]
23 changes: 23 additions & 0 deletions osf/metrics/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,26 @@ class InstitutionalUserReport(MonthlyReport):
published_preprint_count = metrics.Integer()
public_file_count = metrics.Long()
storage_byte_count = metrics.Long()


class PublicItemUsageReport(MonthlyReport):
    '''monthly view and download counts for a single public item

    each report is identified by its (report_yearmonth, item_osfid) pair
    '''
    # the field pair that identifies one report
    # NOTE(review): presumably consumed by MonthlyReport to enforce uniqueness -- confirm in the base class
    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid')

    # where noted, fields correspond to defined terms from COUNTER
    # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
    item_osfid = metrics.Keyword()  # counter:Item
    # the following three are multi-valued: an item may have several values within one month
    item_type = metrics.Keyword(multi=True)  # counter:Data-Type
    provider_id = metrics.Keyword(multi=True)  # counter:Database(?)
    platform_iri = metrics.Keyword(multi=True)  # counter:Platform

    # counts for this item only (not including components or files)
    view_count = metrics.Long()  # counter:Total_Item_Investigations
    view_session_count = metrics.Long()  # counter:Unique_Item_Investigations
    download_count = metrics.Long()  # counter:Total_Item_Requests
    download_session_count = metrics.Long()  # counter:Unique_Item_Requests

    # combined with counts for contained components and files
    # NOTE(review): PublicItemUsageReporter, as added alongside this class, does not set these four fields
    combined_view_count = metrics.Long()
    combined_view_session_count = metrics.Long()
    combined_download_count = metrics.Long()
    combined_download_session_count = metrics.Long()

0 comments on commit e76c74a

Please sign in to comment.