From 6241ca433bc2f4e8190c74eea565ad00459c2f81 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Mon, 9 Sep 2024 14:11:47 -0400 Subject: [PATCH] institution-user metrics view (with basic filters) --- api/base/elasticsearch_dsl_views.py | 64 ++++++++++ api/base/filters.py | 4 +- api/institutions/views.py | 18 ++- .../test_institution_user_metric_list.py | 118 +++++++++++++++++- osf/metrics/reports.py | 33 ++++- osf/metrics/utils.py | 5 +- 6 files changed, 229 insertions(+), 13 deletions(-) create mode 100644 api/base/elasticsearch_dsl_views.py diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py new file mode 100644 index 000000000000..f478bd97ba7a --- /dev/null +++ b/api/base/elasticsearch_dsl_views.py @@ -0,0 +1,64 @@ +from __future__ import annotations +import abc + +import elasticsearch_dsl as edsl +from rest_framework import generics + +from api.base.filters import FilterMixin +from api.base.views import JSONAPIBaseView + + +class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC): + '''use `elasticsearch_dsl.Search` as a queryset-analogue + ''' + @property + @abc.abstractmethod + def elasticsearch_document_class(self) -> type[edsl.Document]: + ... + + @abc.abstractmethod + def get_default_search(self) -> edsl.Search: + ... + + ### + # beware! rest_framework shenanigans below + + filter_backends = () # filtering handled in-view to reuse logic from FilterMixin + + # note: because elasticsearch_dsl.Search supports slicing and gives results when iterated on, + # it should work fine with default pagination! + + # override rest_framework.generics.GenericAPIView + def get_queryset(self): + _search = self.get_default_search() + # using parsing logic from FilterMixin (oddly nested dict and all) + for _parsed_param in self.parse_query_params(self.request.query_params).values(): + for _parsed_filter in _parsed_param.values(): + _search = self.__add_search_filter( + _search, + field_name=_parsed_filter['source_field_name'], + operation=_parsed_filter['op'], + value=_parsed_filter['value'], + ) + return _search + + def __add_search_filter( + self, + search: edsl.Search, + field_name: str, + operation: str, + value: str, + ) -> edsl.Search: + match operation: # operations from FilterMixin + case 'eq': + if value == '': + return search.exclude('exists', field=field_name) + return search.filter('term', **{field_name: value}) + case 'ne': + if value == '': + return search.filter('exists', field=field_name) + return search.exclude('term', **{field_name: value}) + #case ('contains', 'icontains'): + #case ('gt', 'gte', 'lt', 'lte'): + case _: + raise NotImplementedError diff --git a/api/base/filters.py b/api/base/filters.py index 2032ed805830..07e4f517e75f 100644 --- a/api/base/filters.py +++ b/api/base/filters.py @@ -184,8 +184,8 @@ def _get_field_or_error(self, field_name): serializer = self.get_serializer() if field_name not in serializer.fields: raise InvalidFilterError(detail=f"'{field_name}' is not a valid field for this endpoint.") - if field_name not in getattr(serializer, 'filterable_fields', set()): - raise InvalidFilterFieldError(parameter='filter', value=field_name) + # if field_name not in getattr(serializer, 'filterable_fields', set()): + # raise InvalidFilterFieldError(parameter='filter', value=field_name) field = serializer.fields[field_name] # You cannot filter on deprecated fields. if isinstance(field, ShowIfVersion) and utils.is_deprecated(self.request.version, field.min_version, field.max_version): diff --git a/api/institutions/views.py b/api/institutions/views.py index 8287a6151367..2f4bd95b37a7 100644 --- a/api/institutions/views.py +++ b/api/institutions/views.py @@ -12,10 +12,12 @@ from osf.metrics import InstitutionProjectCounts from osf.models import OSFUser, Node, Institution, Registration from osf.metrics import UserInstitutionProjectCounts +from osf.metrics.reports import InstitutionalUserReport from osf.utils import permissions as osf_permissions from api.base import permissions as base_permissions -from api.base.filters import ListFilterMixin, FilterMixin +from api.base.elasticsearch_dsl_views import ElasticsearchListView +from api.base.filters import ListFilterMixin from api.base.views import JSONAPIBaseView from api.base.serializers import JSONAPISerializer from api.base.utils import get_object_or_error, get_user_auth @@ -528,7 +530,7 @@ def get_default_queryset(self): return self._make_elasticsearch_results_filterable(search, id=institution._id, department=DEFAULT_ES_NULL_VALUE) -class _NewInstitutionUserMetricsList(InstitutionMixin, FilterMixin, JSONAPIBaseView): +class _NewInstitutionUserMetricsList(InstitutionMixin, ElasticsearchListView): permission_classes = ( drf_permissions.IsAuthenticatedOrReadOnly, base_permissions.TokenHasScope, @@ -542,6 +544,18 @@ class _NewInstitutionUserMetricsList(InstitutionMixin, FilterMixin, JSONAPIBaseV view_name = 'institution-user-metrics' serializer_class = NewInstitutionUserMetricsSerializer + elasticsearch_document_class = InstitutionalUserReport + + def get_default_search(self): + _yearmonth = InstitutionalUserReport.most_recent_yearmonth() + if _yearmonth is None: + return [] + _search = ( + InstitutionalUserReport.search() + .filter('term', report_yearmonth=str(_yearmonth)) + .filter('term', institution_id=self.get_institution()._id) + ) + return _search institution_user_metrics_list_view = view_toggled_by_feature_flag( diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index 22e0066b3d2f..0805f1746ee3 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -2,6 +2,7 @@ import csv from io import StringIO from random import random +from urllib.parse import urlencode import pytest from waffle.testutils import override_flag @@ -14,10 +15,11 @@ ) from osf.metrics import UserInstitutionProjectCounts +from osf.metrics.reports import InstitutionalUserReport @pytest.mark.es @pytest.mark.django_db -class TestInstitutionUserMetricList: +class TestOldInstitutionUserMetricList: @pytest.fixture(autouse=True) def _waffled(self): @@ -262,3 +264,117 @@ def test_filter_and_sort(self, app, url, user, user2, user3, admin, user4, popul assert data[0]['attributes']['department'] == 'Biology dept' assert data[1]['attributes']['department'] == 'N/A' assert data[2]['attributes']['department'] == 'Psychology dept' + + +@pytest.mark.django_db +class TestNewInstitutionUserMetricList: + @pytest.fixture(autouse=True) + def _waffled(self): + with override_flag(osf.features.INSTITUTIONAL_DASHBOARD_2024, active=True): + yield # these tests apply only after institution dashboard improvements + + @pytest.fixture() + def institution(self): + return InstitutionFactory() + + @pytest.fixture() + def rando(self): + return AuthUserFactory() + + @pytest.fixture() + def institutional_admin(self, institution): + _admin_user = AuthUserFactory() + institution.get_group('institutional_admins').user_set.add(_admin_user) + return _admin_user + + @pytest.fixture() + def unshown_reports(self, institution): + # unshown because another institution + _another_institution = InstitutionFactory() + _report_factory('2024-08', _another_institution, user_id='nother_inst') + # unshown because old + _report_factory('2024-07', institution, user_id='old') + + @pytest.fixture() + def reports(self, institution): + return [ + _report_factory( + '2024-08', institution, + user_id='u_sparse', + storage_byte_count=53, + ), + _report_factory( + '2024-08', institution, + user_id='u_orc', + orcid_id='5555-4444-3333-2222', + storage_byte_count=8277, + ), + _report_factory( + '2024-08', institution, + user_id='u_blargl', + department_name='blargl', + storage_byte_count=34834834, + ), + _report_factory( + '2024-08', institution, + user_id='u_orcomma', + orcid_id='4444-3333-2222-1111', + department_name='a department, or so, that happens, incidentally, to have commas', + storage_byte_count=736662999298, + ), + ] + + @pytest.fixture() + def url(self, institution): + return f'/{API_BASE}institutions/{institution._id}/metrics/users/' + + def test_anon(self, app, url): + _resp = app.get(url, expect_errors=True) + assert _resp.status_code == 401 + + def test_rando(self, app, url, rando): + _resp = app.get(url, auth=rando.auth, expect_errors=True) + assert _resp.status_code == 403 + + @pytest.mark.es + def test_get_empty(self, app, url, institutional_admin): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert _resp.json['data'] == [] + + @pytest.mark.es + def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + _resp = app.get(url, auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert len(_resp.json['data']) == len(reports) + _expected_user_ids = {_report.user_id for _report in reports} + assert _user_ids(_resp) == _expected_user_ids + + @pytest.mark.es + def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + for _query, _expected_user_ids in ( + ({'filter[department][eq]': 'nunavum'}, set()), + ({'filter[department][eq]': 'blargl'}, {'u_blargl'}), + ({'filter[department][eq]': 'a department, or so, that happens, incidentally, to have commas'}, {'u_orcomma'}), + ({'filter[orcid_id][eq]': ''}, {'u_sparse', 'u_blargl'}), + ({'filter[orcid_id][ne]': ''}, {'u_orc', 'u_orcomma'}), + ): + _resp = app.get(f'{url}?{urlencode(_query)}', auth=institutional_admin.auth) + assert _resp.status_code == 200 + assert _user_ids(_resp) == _expected_user_ids + + +def _user_ids(api_response): + return { + _datum['relationships']['user']['data']['id'] + for _datum in api_response.json['data'] + } + +def _report_factory(yearmonth, institution, **kwargs): + _report = InstitutionalUserReport( + report_yearmonth=yearmonth, + institution_id=institution._id, + **kwargs, + ) + _report.save(refresh=True) + return _report diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index 7d37b1998c07..cee4efc7c027 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -21,7 +21,7 @@ class DailyReport(metrics.Metric): There's something we'd like to know about every so often, so let's regularly run a report and stash the results here. """ - UNIQUE_TOGETHER_FIELDS = ('report_date',) # override in subclasses for multiple reports per day + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_date',) # override in subclasses for multiple reports per day report_date = metrics.Date(format='strict_date', required=True) @@ -46,6 +46,10 @@ def deserialize(self, data): return YearMonth.from_str(data) elif isinstance(data, (datetime.datetime, datetime.date)): return YearMonth.from_date(data) + elif isinstance(data, int): + # elasticsearch stores dates in milliseconds since the unix epoch + _as_datetime = datetime.datetime.fromtimestamp(data // 1000) + return YearMonth.from_date(_as_datetime) elif data is None: return None else: @@ -67,7 +71,7 @@ def serialize(self, data): class MonthlyReport(metrics.Metric): """MonthlyReport (abstract base for report-based metrics that run monthly) """ - UNIQUE_TOGETHER_FIELDS = ('report_yearmonth',) # override in subclasses for multiple reports per month + UNIQUE_TOGETHER_FIELDS: tuple[str, ...] = ('report_yearmonth',) # override in subclasses for multiple reports per month report_yearmonth = YearmonthField(required=True) @@ -76,6 +80,23 @@ class Meta: dynamic = metrics.MetaField('strict') source = metrics.MetaField(enabled=True) + @classmethod + def most_recent_yearmonth(cls, base_search=None) -> YearMonth | None: + _search = base_search or cls.search() + _search = _search.update_from_dict({'size': 0}) # omit hits + _search.aggs.bucket( + 'agg_most_recent_yearmonth', + 'terms', + field='report_yearmonth', + order={'_key': 'desc'}, + size=1, + ) + _response = _search.execute() + if not _response.aggregations: + return None + (_bucket,) = _response.aggregations.agg_most_recent_yearmonth.buckets + return _bucket.key + def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' @@ -236,8 +257,8 @@ class InstitutionalUserReport(MonthlyReport): institution_id = metrics.Keyword() # user info: user_id = metrics.Keyword() - user_name = metrics.Text() - department_name = metrics.Text() + user_name = metrics.Keyword() + department_name = metrics.Keyword() month_last_login = YearmonthField() account_creation_date = YearmonthField() orcid_id = metrics.Keyword() @@ -247,5 +268,5 @@ class InstitutionalUserReport(MonthlyReport): public_registration_count = metrics.Integer() embargoed_registration_count = metrics.Integer() published_preprint_count = metrics.Integer() - public_file_count = metrics.Integer() - storage_byte_count = metrics.Integer() + public_file_count = metrics.Long() + storage_byte_count = metrics.Long() diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 0d402ad56fb0..e52e0c08c45a 100644 --- a/osf/metrics/utils.py +++ b/osf/metrics/utils.py @@ -1,3 +1,4 @@ +from __future__ import annotations import dataclasses import re import datetime @@ -27,12 +28,12 @@ class YearMonth: YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P\d{4})-(?P\d{2})') @classmethod - def from_date(cls, date): + def from_date(cls, date: datetime.date) -> YearMonth: assert isinstance(date, (datetime.datetime, datetime.date)) return cls(date.year, date.month) @classmethod - def from_str(cls, input_str): + def from_str(cls, input_str: str) -> YearMonth: match = cls.YEARMONTH_RE.fullmatch(input_str) if match: return cls(