diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index 609e79fc324b..f51820fff663 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -1,3 +1,4 @@ +from collections import abc import datetime from django.dispatch import receiver @@ -20,10 +21,14 @@ class DailyReport(metrics.Metric): There's something we'd like to know about every so often, so let's regularly run a report and stash the results here. """ - DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day + UNIQUE_TOGETHER_FIELDS = ('report_date',) # override in subclasses for multiple reports per day report_date = metrics.Date(format='strict_date', required=True) + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + class Meta: abstract = True dynamic = metrics.MetaField('strict') @@ -58,6 +63,7 @@ def serialize(self, data): class MonthlyReport(metrics.Metric): """MonthlyReport (abstract base for report-based metrics that run monthly) """ + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth',) # override in subclasses for multiple reports per month report_yearmonth = YearmonthField() @@ -66,26 +72,31 @@ class Meta: dynamic = metrics.MetaField('strict') source = metrics.MetaField(enabled=True) + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' + @receiver(metrics_pre_save) def set_report_id(sender, instance, **kwargs): - # Set the document id to a hash of "unique together" - # values (just `report_date` by default) to get - # "ON CONFLICT UPDATE" behavior -- if the document - # already exists, it will be updated rather than duplicated. - # Cannot detect/avoid conflicts this way, but that's ok. - - if issubclass(sender, DailyReport): - duf_name = instance.DAILY_UNIQUE_FIELD - if duf_name is None: - instance.meta.id = stable_key(instance.report_date) - else: - duf_value = getattr(instance, duf_name) - if not duf_value or not isinstance(duf_value, str): - raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})') - instance.meta.id = stable_key(instance.report_date, duf_value) - elif issubclass(sender, MonthlyReport): - instance.meta.id = stable_key(instance.report_yearmonth) + try: + _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS + except AttributeError: + pass + else: + # Set the document id to a hash of "unique together" fields + # for "ON CONFLICT UPDATE" behavior -- if the document + # already exists, it will be updated rather than duplicated. + # Cannot detect/avoid conflicts this way, but that's ok. + _key_values = [] + for _field_name in _unique_together_fields: + _field_value = getattr(instance, _field_name) + if not _field_value or ( + isinstance(_field_value, abc.Collection) and not isinstance(_field_value, str) + ): + raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})') + _key_values.append(_field_value) + instance.meta.id = stable_key(*_key_values) #### BEGIN reusable inner objects ##### @@ -157,7 +168,7 @@ class DownloadCountReport(DailyReport): class InstitutionSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'institution_id' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',) institution_id = metrics.Keyword() institution_name = metrics.Keyword() @@ -169,7 +180,7 @@ class InstitutionSummaryReport(DailyReport): class NewUserDomainReport(DailyReport): - DAILY_UNIQUE_FIELD = 'domain_name' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',) domain_name = metrics.Keyword() new_user_count = metrics.Integer() @@ -187,7 +198,7 @@ class OsfstorageFileCountReport(DailyReport): class PreprintSummaryReport(DailyReport): - DAILY_UNIQUE_FIELD = 'provider_key' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',) provider_key = metrics.Keyword() preprint_count = metrics.Integer() diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 5ea397fef39d..0d402ad56fb0 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -1,9 +1,8 @@ +import dataclasses import re import datetime -import typing from hashlib import sha256 - -import pytz +from typing import ClassVar def stable_key(*key_parts): @@ -20,11 +19,12 @@ def stable_key(*key_parts): return sha256(bytes(plain_key, encoding='utf')).hexdigest() -class YearMonth(typing.NamedTuple): +@dataclasses.dataclass(frozen=True) +class YearMonth: year: int month: int - YEARMONTH_RE = re.compile(r'(?P\d{4})-(?P\d{2})') + YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P\d{4})-(?P\d{2})') @classmethod def from_date(cls, date): @@ -46,9 +46,9 @@ def __str__(self): return f'{self.year}-{self.month:0>2}' def target_month(self): - return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc) + return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC) def next_month(self): if self.month == 12: - return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc) - return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc) + return datetime.datetime(self.year + 1, 1, 1, tzinfo=datetime.UTC) + return datetime.datetime(self.year, self.month + 1, 1, tzinfo=datetime.UTC) diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 2089e7279c95..b33f731175ae 100644 --- a/osf_tests/metrics/test_daily_report.py +++ b/osf_tests/metrics/test_daily_report.py @@ -40,7 +40,7 @@ class Meta: def test_with_duf(self, mock_save): # multiple reports of this type per day, unique by given field class UniqueByDateAndField(DailyReport): - DAILY_UNIQUE_FIELD = 'duf' + UNIQUE_TOGETHER_FIELDS = ('report_date', 'duf',) duf = metrics.Keyword() class Meta: diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py new file mode 100644 index 000000000000..3ee38e2b9f4b --- /dev/null +++ b/osf_tests/metrics/test_monthly_report.py @@ -0,0 +1,69 @@ +from unittest import mock + +import pytest +from elasticsearch_metrics import metrics + +from osf.metrics.reports import MonthlyReport, ReportInvalid +from osf.metrics.utils import YearMonth + + +class TestMonthlyReportKey: + @pytest.fixture + def mock_save(self): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save + + def test_default(self, mock_save): + # only one of this type of report per month + class UniqueByMonth(MonthlyReport): + blah = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + + reports = [ + UniqueByMonth(report_yearmonth=yearmonth), + UniqueByMonth(report_yearmonth=yearmonth, blah='blah'), + UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'), + ] + expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2' + + for report in reports: + report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is report + assert report.meta.id == expected_key + mock_save.reset_mock() + + def test_with_duf(self, mock_save): + # multiple reports of this type per day, unique by given field + class UniqueByMonthAndField(MonthlyReport): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'duf',) + duf = metrics.Keyword() + + class Meta: + app_label = 'osf' + + yearmonth = YearMonth(2022, 5) + + expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08' + blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, duf='blah') + blah_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is blah_report + assert blah_report.meta.id == expected_blah + mock_save.reset_mock() + + expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f' + fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, duf='fleh') + fleh_report.save() + assert mock_save.call_count == 1 + assert mock_save.call_args[0][0] is fleh_report + assert fleh_report.meta.id == expected_fleh + mock_save.reset_mock() + + bad_report = UniqueByMonthAndField(report_yearmonth=yearmonth) + with pytest.raises(ReportInvalid): + bad_report.save()