diff --git a/.docker-compose.env b/.docker-compose.env index 9cb7a59e274..80eebc8707b 100644 --- a/.docker-compose.env +++ b/.docker-compose.env @@ -7,6 +7,8 @@ INTERNAL_DOMAIN=http://192.168.168.167:5000/ API_DOMAIN=http://localhost:8000/ ELASTIC_URI=192.168.168.167:9200 ELASTIC6_URI=192.168.168.167:9201 +ELASTIC8_URI=http://192.168.168.167:9202 +ELASTIC8_USERNAME=elastic OSF_DB_HOST=192.168.168.167 DB_HOST=192.168.168.167 REDIS_HOST=redis://192.168.168.167:6379 diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 00c5820b5b8..f147941c5ff 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -37,7 +37,19 @@ jobs: permissions: checks: write services: - postgres: + elasticsearch8: &ES8_SERVICE + image: elasticsearch:8.19.14 + ports: + - 9202:9200 + env: + discovery.type: single-node + xpack.security.enabled: false + options: >- + --health-cmd "curl -sf 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=30s'" + --health-interval 10s + --health-timeout 30s + --health-retries 5 + postgres: &POSTGRES_SERVICE image: postgres env: POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} @@ -54,6 +66,8 @@ - uses: ./.github/actions/start-build - name: Run tests run: poetry run python3 -m invoke test-ci-addons --junit + env: + ELASTIC8_URI: http://localhost:9202 - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report @@ -64,18 +78,7 @@ permissions: checks: write services: - postgres: - image: postgres - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v6 - uses: ./.github/actions/start-build @@ -91,18 +94,8 @@ permissions: checks: write services: - postgres: - image: postgres - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + elasticsearch8: *ES8_SERVICE + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v6 - uses: ./.github/actions/start-build @@ -110,6 +103,8 @@ run: poetry run python3 -m invoke assets --dev - name: Run test run: poetry run python3 -m invoke test-ci-api1-and-js --junit + env: + ELASTIC8_URI: http://localhost:9202 - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report @@ -120,23 +115,15 @@ permissions: checks: write services: - postgres: - image: postgres - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + elasticsearch8: *ES8_SERVICE + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v6 - uses: ./.github/actions/start-build - name: Run tests run: poetry run python3 -m invoke test-ci-api2 --junit + env: + ELASTIC8_URI: http://localhost:9202 - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report @@ -147,19 +134,7 @@ checks: write needs: build-cache services: - postgres: - image: postgres - - env: -
POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v6 - uses: ./.github/actions/start-build @@ -175,19 +150,7 @@ jobs: checks: write needs: build-cache services: - postgres: - image: postgres - - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE mailhog: image: mailhog/mailhog ports: @@ -208,19 +171,7 @@ jobs: checks: write needs: build-cache services: - postgres: - image: postgres - - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v6 - uses: ./.github/actions/start-build diff --git a/addons/base/views.py b/addons/base/views.py index 8b4097244b3..12b78fb9957 100644 --- a/addons/base/views.py +++ b/addons/base/views.py @@ -14,7 +14,7 @@ import waffle from django.db import transaction from django.contrib.contenttypes.models import ContentType -from elasticsearch import exceptions as es_exceptions +from elasticsearch6 import exceptions as es_exceptions from rest_framework import status as http_status from api.caching.tasks import update_storage_usage_with_size @@ -34,6 +34,7 @@ from framework.flask import redirect from framework.sentry import log_exception from framework.transactions.handlers import no_auto_transaction +from osf.metrics.es8_metrics import OsfCountedUsageEvent from website import settings from addons.base import signals as file_signals from addons.base.utils import format_last_known_metadata, get_mfr_url @@ -691,6 +692,18 @@ def osfstoragefile_viewed_update_metrics(self, auth, fileversion, file_node): version=fileversion.identifier, path=file_node.path, ) + OsfCountedUsageEvent.record( + user_id=getattr(user, '_id', None), + item_osfid=resource._id, + action_labels=[ + OsfCountedUsageEvent.ActionLabel.VIEW.value, + OsfCountedUsageEvent.ActionLabel.WEB.value, + ], + # HACK: we don't have the user request, so fabricate a one-off session id + # (this means no double-click filtering for anonymous users (same as before) + # and potentially inflated "unique" sessionhour view counts) + client_session_id=str(uuid.uuid4()), + ) except es_exceptions.ConnectionError: log_exception() @@ -718,6 +731,17 @@ def osfstoragefile_downloaded_update_metrics(self, auth, fileversion, file_node) version=fileversion.identifier, path=file_node.path, ) + OsfCountedUsageEvent.record( + user_id=getattr(user, '_id', None), + item_osfid=resource._id, + action_labels=[ + OsfCountedUsageEvent.ActionLabel.DOWNLOAD.value, + ], + # HACK: we don't have the user request, so fabricate a one-off session id + # (this means no double-click filtering for anonymous users (same as before) + # and potentially inflated "unique" sessionhour view counts) + client_session_id=str(uuid.uuid4()), + ) except es_exceptions.ConnectionError: log_exception() diff --git a/admin/management/urls.py b/admin/management/urls.py index c046b3bed18..2e4cd7479a1 100644 --- a/admin/management/urls.py +++ b/admin/management/urls.py @@ -1,4 
+1,4 @@ -from django.urls import re_path +from django.urls import re_path, path from admin.management import views @@ -21,5 +21,6 @@ re_path(r'^sync_notification_templates', views.SyncNotificationTemplates.as_view(), name='sync_notification_templates'), re_path(r'^remove_orcid_from_user_social', views.RemoveOrcidFromUserSocial.as_view(), - name='remove_orcid_from_user_social') + name='remove_orcid_from_user_social'), + path('migrate_osfmetrics_6to8', views.MigrateOsfmetrics6to8.as_view(), name='migrate_osfmetrics_6to8'), ] diff --git a/admin/management/views.py b/admin/management/views.py index f2052822f37..3c112347529 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -1,9 +1,12 @@ +from io import StringIO + from dateutil.parser import isoparse from django.views.generic import TemplateView, View from django.contrib import messages from django.http import HttpResponse from django.utils import timezone from django.contrib.auth.mixins import PermissionRequiredMixin +from django.core.management import call_command from osf.management.commands.manage_switch_flags import manage_waffle from osf.management.commands.update_registration_schemas import update_registration_schemas @@ -190,3 +193,23 @@ def post(self, request): remove_orcid_from_user_social() messages.success(request, 'Orcid from user social have been successfully removed.') return redirect(reverse('management:commands')) + + +class MigrateOsfmetrics6to8(ManagementCommandPermissionView): + def post(self, request): + _command_kwargs = { + 'no_setup': True, + 'no_color': True, + 'no_counts': request.POST.get('no_counts'), + 'clear_state': request.POST.get('clear_state'), + 'clear_es8_data': request.POST.get('clear_es8_data'), + 'start': request.POST.get('start'), + 'unchanged': request.POST.get('unchanged'), + 'usage_reports': request.POST.get('usage_reports'), + 'usage_events': request.POST.get('usage_events'), + } + _out_io = StringIO() + call_command('migrate_osfmetrics_6to8', **_command_kwargs, stdout=_out_io) + for _line in _out_io.getvalue().split('\n'): + messages.info(request, _line) + return redirect(reverse('management:commands')) diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html index edf242abfdd..6b9ee927e0d 100644 --- a/admin/templates/management/commands.html +++ b/admin/templates/management/commands.html @@ -178,6 +178,31 @@

         Remove existing orcid info from user social
     </form>
 </div>
+<hr>
+<div>
+    <h4>migrate osf-metrics 6to8</h4>
+    <p>
+        view progress of the osf-metrics migration from elastic6 to elastic8 (or start it)
+    </p>
+    <form method="post" action="{% url 'management:migrate_osfmetrics_6to8' %}">
+        {% csrf_token %}
+        <label><input type="checkbox" name="no_counts"> no_counts</label>
+        <label><input type="checkbox" name="clear_state"> clear_state</label>
+        <label><input type="checkbox" name="clear_es8_data"> clear_es8_data</label>
+        <label><input type="checkbox" name="start"> start</label>
+        (narrow types:
+        <label><input type="checkbox" name="unchanged"> unchanged</label>
+        <label><input type="checkbox" name="usage_reports"> usage_reports</label>
+        <label><input type="checkbox" name="usage_events"> usage_events</label>
+        )
+        <input class="btn btn-danger" type="submit" value="Run">
+    </form>
+</div>
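+{# note: the checkbox names above map 1:1 to the POST params read by MigrateOsfmetrics6to8 in admin/management/views.py #}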
{% endblock %} diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py index 6199fd82d0e..ecf2825d4e8 100644 --- a/api/base/elasticsearch_dsl_views.py +++ b/api/base/elasticsearch_dsl_views.py @@ -3,7 +3,7 @@ import datetime import typing -import elasticsearch_dsl as edsl +import elasticsearch6_dsl as edsl from rest_framework import generics, exceptions as drf_exceptions from rest_framework.settings import api_settings as drf_settings from api.base.settings.defaults import REPORT_FILENAME_FORMAT @@ -23,7 +23,7 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC): - '''abstract view class using `elasticsearch_dsl.Search` as a queryset-analogue + '''abstract view class using `elasticsearch6_dsl.Search` as a queryset-analogue builds a `Search` based on `self.get_default_search()` and the request's query parameters for filtering, sorting, and pagination -- fetches only @@ -36,7 +36,7 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, @abc.abstractmethod def get_default_search(self) -> edsl.Search | None: - '''the base `elasticsearch_dsl.Search` for this list, based on url path + '''the base `elasticsearch6_dsl.Search` for this list, based on url path (common jsonapi query parameters will be considered automatically) ''' @@ -95,7 +95,7 @@ def finalize_response(self, request, response, *args, **kwargs): # (filtering handled in-view to reuse logic from FilterMixin) filter_backends = () - # note: because elasticsearch_dsl.Search supports slicing and gives results when iterated on, + # note: because elasticsearch6_dsl.Search supports slicing and gives results when iterated on, # it works fine with default pagination # override rest_framework.generics.GenericAPIView diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 52d30b40f9a..ac9a9739f1b 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -320,10 +320,27 @@ HASHIDS_SALT = 'pinkhimalayan' # django-elasticsearch-metrics -ELASTICSEARCH_DSL = { - 'default': { - 'hosts': osf_settings.ELASTIC6_URI, - 'retry_on_timeout': True, +DJELME_BACKENDS = { + 'osfmetrics_es6': { + 'elasticsearch_metrics.imps.elastic6': { + 'hosts': osf_settings.ELASTIC6_URI, + 'retry_on_timeout': True, + }, + }, + 'osfmetrics_es8': { + 'elasticsearch_metrics.imps.elastic8': { + # passthru kwargs to elasticsearch8 connection constructor + 'hosts': osf_settings.ELASTIC8_URI, + 'ca_certs': osf_settings.ELASTIC8_CERT_PATH, + 'basic_auth': ( + (osf_settings.ELASTIC8_USERNAME, osf_settings.ELASTIC8_SECRET) + if osf_settings.ELASTIC8_SECRET is not None + else None + ), + 'ssl_assert_hostname': osf_settings.ELASTIC8_ASSERT_HOSTNAME, + # djelme-specific kwargs + 'djelme_default_index_name_prefix': osf_settings.SHARE_PROVIDER_PREPEND, + }, }, } # Store yearly indices for time-series metrics diff --git a/api/metrics/serializers.py b/api/metrics/serializers.py index 5bbde293505..9e3f61f5b50 100644 --- a/api/metrics/serializers.py +++ b/api/metrics/serializers.py @@ -6,6 +6,10 @@ from api.base.serializers import BaseAPISerializer from api.base.utils import absolute_reverse from osf.metrics.counted_usage import CountedAuthUsage, PageviewInfo +from osf.metrics.es8_metrics import ( + OsfCountedUsageEvent, + PageviewInfo as PageviewInfoEs8, +) from website import settings as website_settings logger = logging.getLogger(__name__) @@ -42,7 +46,7 @@ class PageviewInfoSerializer(ser.Serializer): class 
CountedAuthUsageSerializer(ser.Serializer): - item_guid = ser.CharField(max_length=255, required=False) + item_guid = ser.CharField(max_length=255, required=True) client_session_id = ser.CharField(max_length=255, required=False) provider_id = ser.CharField(max_length=255, required=False) @@ -64,8 +68,21 @@ def validate(self, data): def create(self, validated_data): pageview_info = None + pageview_info_es8 = None if pageview_info_data := validated_data.get('pageview_info'): pageview_info = PageviewInfo(**pageview_info_data) + pageview_info_es8 = PageviewInfoEs8(**pageview_info_data) + OsfCountedUsageEvent.record( + item_osfid=validated_data['item_guid'], + action_labels=validated_data.get('action_labels'), + provider_id=validated_data.get('provider_id'), + pageview_info=pageview_info_es8, + # used to create a COUNTER session-hour id, not stored: + client_session_id=validated_data.get('client_session_id'), + user_id=self.context.get('user_id'), + request_host=self.context.get('request_host'), + request_useragent=self.context.get('request_useragent'), + ) return CountedAuthUsage.record( platform_iri=website_settings.DOMAIN, provider_id=validated_data.get('provider_id'), diff --git a/api/metrics/urls.py b/api/metrics/urls.py index e135212541c..db63df3dd4c 100644 --- a/api/metrics/urls.py +++ b/api/metrics/urls.py @@ -5,7 +5,9 @@ app_name = 'osf' urlpatterns = [ - re_path(r'^raw/(?P<url_path>[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name), + re_path(r'^raw/(?P<url_path>[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'djelme_backend_name': 'osfmetrics_es6'}), + path('raw-<str:djelme_backend_name>/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'url_path': ''}), + path('raw-<str:djelme_backend_name>/<path:url_path>', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name), re_path(r'^preprints/views/$', views.PreprintViewMetrics.as_view(), name=views.PreprintViewMetrics.view_name), re_path(r'^preprints/downloads/$', views.PreprintDownloadMetrics.as_view(), name=views.PreprintDownloadMetrics.view_name), re_path(r'^registries_moderation/transitions/$', views.RegistriesModerationMetricsView.as_view(), name=views.RegistriesModerationMetricsView.view_name), diff --git a/api/metrics/views.py b/api/metrics/views.py index 99ecf3fe347..bd53bee296e 100644 --- a/api/metrics/views.py +++ b/api/metrics/views.py @@ -6,8 +6,9 @@ from django.http import JsonResponse, HttpResponse, Http404 from django.utils import timezone -from elasticsearch.exceptions import NotFoundError, RequestError -from elasticsearch_dsl.connections import get_connection +from elasticsearch6.exceptions import NotFoundError, RequestError +from elasticsearch6_dsl.connections import get_connection +from elasticsearch_metrics.registry import djelme_registry from framework.auth.oauth_scopes import CoreScopes @@ -226,24 +227,49 @@ def delete(self, request, *args, **kwargs): raise ValidationError('DELETE not supported.
Use GET/POST/PUT') @require_switch(ENABLE_RAW_METRICS) - def get(self, request, *args, **kwargs): - connection = get_connection() - url_path = kwargs['url_path'] - return JsonResponse(connection.transport.perform_request('GET', f'/{url_path}')) + def get(self, request, *args, djelme_backend_name, url_path, **kwargs): + _response_body = self._do_es_request( + djelme_backend_name, + method='GET', + path=url_path, + qp=request.GET, + ) + return JsonResponse(_response_body) @require_switch(ENABLE_RAW_METRICS) - def post(self, request, *args, **kwargs): - connection = get_connection() - url_path = kwargs['url_path'] - body = json.loads(request.body) - return JsonResponse(connection.transport.perform_request('POST', f'/{url_path}', body=body)) + def post(self, request, *args, djelme_backend_name, url_path, **kwargs): + _response_body = self._do_es_request( + djelme_backend_name, + method='POST', + path=url_path, + qp=request.GET, + body=json.loads(request.body), + ) + return JsonResponse(_response_body) @require_switch(ENABLE_RAW_METRICS) - def put(self, request, *args, **kwargs): - connection = get_connection() - url_path = kwargs['url_path'] - body = json.loads(request.body) - return JsonResponse(connection.transport.perform_request('PUT', f'/{url_path}', body=body)) + def put(self, request, *args, djelme_backend_name, url_path, **kwargs): + _response_body = self._do_es_request( + djelme_backend_name, + method='PUT', + path=url_path, + qp=request.GET, + body=json.loads(request.body), + ) + return JsonResponse(_response_body) + + def _do_es_request(self, djelme_backend_name, method, path, qp, body=None): + _client = self._get_es_client(djelme_backend_name) + _perform_fn = getattr(_client, 'perform_request', None) or _client.transport.perform_request + _response = _perform_fn(method, f'/{path}', params=qp.dict(), body=body) + return _response if isinstance(_response, dict) else _response.body + + def _get_es_client(self, djelme_backend_name): + try: + _backend = djelme_registry.get_backend(djelme_backend_name) + except LookupError: + raise Http404 + return _backend.elastic_client class RegistriesModerationMetricsView(GenericAPIView): @@ -387,7 +413,14 @@ class CountedAuthUsageView(JSONAPIBaseView): serializer_class = CountedAuthUsageSerializer def post(self, request, *args, **kwargs): - serializer = self.serializer_class(data=request.data) + serializer = self.serializer_class( + data=request.data, + context={ + 'user_id': request.user._id if request.user.is_authenticated else None, + 'request_host': request.get_host(), + 'request_useragent': request.META.get('HTTP_USER_AGENT', ''), + }, + ) serializer.is_valid(raise_exception=True) if should_skip_counted_usage( request.user, @@ -403,6 +436,8 @@ def post(self, request, *args, **kwargs): return HttpResponse(status=201) def _get_session_id(self, request, client_session_id=None): + # NOTE: to remove after osfmetrics 6to8 migration -- logic moved to djelme + # get a session id as described in the COUNTER code of practice: # https://cop5.projectcounter.org/en/5.0.2/07-processing/03-counting-unique-items.html # -- different from the "login session" tracked by `osf.models.Session` (which diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py index c2a5c0fcf99..8b785504756 100644 --- a/api_tests/institutions/views/test_institution_department_list.py +++ b/api_tests/institutions/views/test_institution_department_list.py @@ -44,7 +44,7 @@ def populate_counts(self, 
user, user2, user3, user4, admin, institution): department_name='Old Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() _this_month = YearMonth.from_date(datetime.date.today()) @@ -56,7 +56,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): department_name='New Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() # A second user entered the department InstitutionalUserReport( @@ -66,7 +66,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): department_name='New Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() # A new department with a single user to test sorting InstitutionalUserReport( @@ -76,7 +76,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): department_name='Smaller Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() # A user with no department InstitutionalUserReport( @@ -85,7 +85,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): institution_id=institution._id, public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() @pytest.fixture() def admin(self, institution): @@ -113,6 +113,7 @@ def test_auth(self, app, url, user, admin): assert resp.json['data'] == [] def test_get(self, app, url, admin, institution, populate_counts): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) resp = app.get(url, auth=admin.auth) assert resp.json['data'] == [{ diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py index 41983458d2e..6dd6c5bbda3 100644 --- a/api_tests/institutions/views/test_institution_summary_metrics.py +++ b/api_tests/institutions/views/test_institution_summary_metrics.py @@ -84,6 +84,7 @@ def test_get_empty(self, app, url, institutional_admin): assert resp.json['meta'] == {'version': '2.0'} def test_get_report(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -149,6 +150,7 @@ def test_get_report_with_multiple_months_and_institutions( monthly_logged_in_user_count=270, monthly_active_user_count=260, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -189,6 +191,7 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_ institution, user_count=4133, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -213,6 +216,7 @@ def test_get_with_invalid_report_date(self, app, url, institution, institutional institution, user_count=999, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) # Request with an invalid report_date format resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth) @@ -233,6 +237,7 @@ def test_get_without_report_date_uses_most_recent(self, 
app, url, institution, i institution, user_count=999, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -247,5 +252,5 @@ def _summary_report_factory(yearmonth, institution, **kwargs): institution_id=institution._id, **kwargs, ) - report.save(refresh=True) + report.save() return report diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index 0826dcd0161..d2b99da435f 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -89,6 +89,7 @@ def test_get_empty(self, app, url, institutional_admin): assert _resp.json['data'] == [] def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) _resp = app.get(url, auth=institutional_admin.auth) assert _resp.status_code == 200 assert len(_resp.json['data']) == len(reports) @@ -100,6 +101,7 @@ def test_get_reports(self, app, url, institutional_admin, institution, reports, assert len(response_object['attributes']['contacts']) == 0 def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) for _query, _expected_user_ids in ( ({'filter[department]': 'nunavum'}, set()), ({'filter[department]': 'incidentally'}, set()), @@ -135,6 +137,7 @@ def test_filter_reports(self, app, url, institutional_admin, institution, report assert set(_user_ids(_resp)) == _expected_user_ids def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']), ({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']), @@ -144,6 +147,7 @@ def test_sort_reports(self, app, url, institutional_admin, institution, reports, assert list(_user_ids(_resp)) == _expected_user_id_list def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']), ({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']), @@ -178,6 +182,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu month_last_active='2018-02', month_last_login='2018-02', ) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -281,6 +286,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu str(736662999298 + i), f'Jalen Hurts #{i}', ]) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) # Make request for CSV format with page[size]=10 resp = app.get(f'{url}?format={format_type}', 
auth=institutional_admin.auth) @@ -346,6 +352,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti month_last_active='2018-02', month_last_login='2018-02', ) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) resp = app.get(f'{url}?format=json_report', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -411,6 +418,7 @@ def test_correct_number_of_contact_messages(self, app, url, institutional_admin, department_name='a department, or so, that happens, incidentally, to have commas', storage_byte_count=736662999298, ) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) receiver = user1 with capture_notifications(): @@ -477,5 +485,5 @@ def _report_factory(yearmonth, institution, **kwargs): institution_id=institution._id, **kwargs, ) - _report.save(refresh=True) + _report.save() return _report diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py index 0cd0b3bb180..016677c3a11 100644 --- a/api_tests/metrics/test_composite_query.py +++ b/api_tests/metrics/test_composite_query.py @@ -1,4 +1,3 @@ -import time import pytest from datetime import datetime from osf_tests.factories import ( @@ -75,7 +74,7 @@ def test_elasticsearch_agg_query(self, app, user, base_url, preprint): path=preprint.primary_file.path, timestamp=datetime(year=2020, month=2, day=1) ) - time.sleep(1) # gives ES some time to update + PreprintDownload._get_connection().indices.refresh(PreprintDownload._template_pattern) resp = app.post_json_api(post_url, payload, auth=user.auth) assert resp.status_code == 200 diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py index be1d986ff6d..e954248c15b 100644 --- a/api_tests/metrics/test_counted_usage.py +++ b/api_tests/metrics/test_counted_usage.py @@ -16,6 +16,7 @@ ) from osf.utils.permissions import ADMIN, READ, WRITE from api_tests.utils import create_test_file +from elasticsearch_metrics.tests.util import djelme_test_backends COUNTED_USAGE_URL = '/_/metrics/events/counted_usage/' @@ -43,8 +44,9 @@ def assert_saved_with(mock_save, *, expected_doc_id=None, expected_attrs): @pytest.fixture def mock_save(): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save + with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save @pytest.mark.django_db @@ -73,6 +75,12 @@ def test_required_attributes(self, app, attrs): @pytest.mark.django_db class TestComputedFields: + + @pytest.fixture(autouse=True) + def _real_elastic(self): + with djelme_test_backends(): + yield + @pytest.fixture(autouse=True) def mock_domain(self): domain = 'http://example.foo/' @@ -85,15 +93,22 @@ def mock_now(self): with mock.patch('django.utils.timezone.now', return_value=timestamp): yield timestamp + @pytest.fixture + def preprint(self, request): + return PreprintFactory( + is_public=True, + is_published=True, + ) + @pytest.fixture() def user(self): with mock.patch('osf.models.base.generate_guid', return_value='guidy'): return AuthUserFactory() - def test_by_client_session_id(self, app, mock_save, user): + def test_by_client_session_id(self, app, mock_save, user, preprint): payload = counted_usage_payload( client_session_id='hello', - item_guid='zyxwv', + item_guid=preprint._id, action_labels=['view', 'api'], 
pageview_info={'page_url': 'http://example.foo/blahblah/blee'}, ) @@ -108,7 +123,7 @@ def test_by_client_session_id(self, app, mock_save, user): expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2', expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': 'zyxwv', + 'item_guid': preprint._id, # session_id: sha256(b'hello|1981-01-01').hexdigest() 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34', 'action_labels': ['view', 'api'], @@ -120,10 +135,10 @@ def test_by_client_session_id(self, app, mock_save, user): }, ) - def test_by_client_session_id_anon(self, app, mock_save): + def test_by_client_session_id_anon(self, app, mock_save, preprint): payload = counted_usage_payload( client_session_id='hello', - item_guid='zyxwv', + item_guid=preprint._id, action_labels=['view', 'web'], pageview_info={ 'page_url': 'http://example.foo/bliz/', @@ -141,7 +156,7 @@ def test_by_client_session_id_anon(self, app, mock_save): expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82', expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': 'zyxwv', + 'item_guid': preprint._id, # session_id: sha256(b'hello|1981-01-01').hexdigest() 'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34', 'action_labels': ['view', 'web'], @@ -155,9 +170,9 @@ def test_by_client_session_id_anon(self, app, mock_save): }, ) - def test_by_user_auth(self, app, mock_save, user): + def test_by_user_auth(self, app, mock_save, user, preprint): payload = counted_usage_payload( - item_guid='yxwvu', + item_guid=preprint._id, action_labels=['view', 'web'], pageview_info={ 'page_url': 'http://osf.io/mst3k', @@ -175,7 +190,7 @@ def test_by_user_auth(self, app, mock_save, user): expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2', expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': 'yxwvu', + 'item_guid': preprint._id, # session_id: sha256(b'guidy|1981-01-01|0').hexdigest() 'session_id': 'ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a', 'action_labels': ['view', 'web'], @@ -189,10 +204,14 @@ def test_by_user_auth(self, app, mock_save, user): }, ) - def test_by_useragent_header(self, app, mock_save): + def test_by_useragent_header(self, app, mock_save, preprint): payload = counted_usage_payload( - item_guid='yxwvu', + item_guid=preprint._id, action_labels=['view', 'api'], + pageview_info={ + 'page_url': 'http://example.foo/bliz/', + 'referer_url': 'http://elsewhere.baz/index.php', + }, ) headers = { 'User-Agent': 'haha', @@ -202,14 +221,20 @@ def test_by_useragent_header(self, app, mock_save): assert_saved_with( mock_save, # doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest() - expected_doc_id='d669528b30f443ffe506e183537af9624ef290090e90a200ecce7b7ca19c77f7', + expected_doc_id='6d7549df6734bb955eb832c6316ffae46c2959c95b5817ab4fcb341dbc875c23', expected_attrs={ 'platform_iri': 'http://example.foo/', - 'item_guid': 'yxwvu', + 'item_guid': preprint._id, # session_id: sha256(b'localhost:80|haha|1981-01-01|0').hexdigest() 'session_id': '97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a', 'action_labels': ['view', 'api'], - 'pageview_info': None, + 'pageview_info': { + 'page_url': 'http://example.foo/bliz/', + 'page_path': '/bliz', + 'referer_url': 'http://elsewhere.baz/index.php', + 'referer_domain': 'elsewhere.baz', + 'hour_of_day': 0, + }, 
}, ) @@ -217,6 +242,12 @@ def test_by_useragent_header(self, app, mock_save): @pytest.mark.parametrize('item_public', [True, False]) @pytest.mark.django_db class TestGuidFields: + + @pytest.fixture(autouse=True) + def _real_elastic(self): + with djelme_test_backends(): + yield + @pytest.fixture def preprint(self, item_public): return PreprintFactory( @@ -261,7 +292,7 @@ def test_preprint_file(self, app, mock_save, preprint, item_public): item_guid=preprint._id, action_labels=['view', 'web'], ) - resp = app.post_json_api(COUNTED_USAGE_URL, payload) + resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( mock_save, @@ -280,7 +311,7 @@ def test_preprint_file(self, app, mock_save, preprint, item_public): item_guid=preprint.primary_file.get_guid(create=True)._id, action_labels=['view', 'web'], ) - resp = app.post_json_api(COUNTED_USAGE_URL, payload) + resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( mock_save, @@ -299,7 +330,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil item_guid=child_reg_file_guid, action_labels=['view', 'web'], ) - resp = app.post_json_api(COUNTED_USAGE_URL, payload) + resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( mock_save, @@ -322,7 +353,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil item_guid=child_reg._id, action_labels=['view', 'web'], ) - resp = app.post_json_api(COUNTED_USAGE_URL, payload) + resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( mock_save, @@ -344,7 +375,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil item_guid=parent_reg._id, action_labels=['view', 'web'], ) - resp = app.post_json_api(COUNTED_USAGE_URL, payload) + resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'}) assert resp.status_code == 201 assert_saved_with( mock_save, diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py index 1bde8719b75..cd9b8041c2d 100644 --- a/api_tests/metrics/test_preprint_metrics.py +++ b/api_tests/metrics/test_preprint_metrics.py @@ -8,7 +8,7 @@ from django.utils import timezone from waffle.testutils import override_switch -from elasticsearch.exceptions import RequestError +from elasticsearch6.exceptions import RequestError from osf import features from api.base.settings import API_PRIVATE_BASE as API_BASE diff --git a/api_tests/metrics/test_raw_metrics.py b/api_tests/metrics/test_raw_metrics.py index 6a3b9b8f8c5..e32936d9024 100644 --- a/api_tests/metrics/test_raw_metrics.py +++ b/api_tests/metrics/test_raw_metrics.py @@ -1,10 +1,10 @@ import pytest -import time from website.app import setup_django setup_django() from waffle.testutils import override_switch +from elasticsearch6_dsl.connections import connections as es6_connections from osf import features from osf_tests.factories import AuthUserFactory @@ -40,9 +40,9 @@ def user(self): def other_user(self): return AuthUserFactory() - @pytest.fixture - def base_url(self): - return f'/{API_BASE}metrics/raw/' + @pytest.fixture(params=['raw', 'raw-osfmetrics_es6']) + def base_url(self, request): + return f'/{API_BASE}metrics/{request.param}/' def test_delete(self, app, user, base_url): res = 
app.delete_json_api(base_url, auth=user.auth, expect_errors=True) @@ -136,7 +136,9 @@ def test_post_and_get(self, app, user, base_url): res = app.post_json_api(post_url, post_data, auth=user.auth) assert res.json == post_return - time.sleep(3) + es6_connections.get_connection('osfmetrics_es6').indices.refresh( + index='customer', + ) get_url = f'{base_url}customer/_search?q=*' res = app.get(get_url, auth=user.auth) diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index 93469b1b3b5..f5d3a047b10 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -1,8 +1,5 @@ import pytest -from waffle.testutils import override_switch -import time -from osf import features from osf_tests.factories import RegistrationFactory, AuthUserFactory from osf.utils.workflows import RegistrationModerationStates, RegistrationModerationTriggers from osf.metrics import RegistriesModerationMetrics @@ -18,11 +15,6 @@ class TestRegistrationModerationMetrics: def registration(self): return RegistrationFactory() - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - @pytest.mark.es_metrics def test_record_transitions(self, registration): with capture_notifications(): @@ -32,7 +24,7 @@ def test_record_transitions(self, registration): registration.creator, 'Metrics is easy' ) - time.sleep(1) + RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) assert RegistriesModerationMetrics.search().count() == 1 data = RegistriesModerationMetrics.search().execute()['hits']['hits'][0]['_source'] @@ -51,11 +43,6 @@ class TestRegistrationModerationMetricsView: def registration(self): return RegistrationFactory() - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - @pytest.fixture def user(self): user = AuthUserFactory() @@ -81,7 +68,7 @@ def test_registries_moderation_view(self, app, user, base_url, registration): registration.creator, 'Metrics is easy' ) - time.sleep(1) + RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) res = app.get(base_url, auth=user.auth, expect_errors=True) data = res.json diff --git a/conftest.py b/conftest.py index 9494e3d296e..e80c4e5c566 100644 --- a/conftest.py +++ b/conftest.py @@ -1,24 +1,24 @@ -import contextlib from unittest import mock import logging import os import re -from django.core.management import call_command from django.db import transaction -from elasticsearch import exceptions as es_exceptions -from elasticsearch_dsl.connections import connections -from elasticsearch_metrics.registry import registry as es_metrics_registry +from elasticsearch6_dsl.connections import connections +from elasticsearch_metrics.tests.util import djelme_test_backends from faker import Factory import pytest import responses import xml.etree.ElementTree as ET +from waffle.testutils import override_switch from api_tests.share import _utils as shtrove_test_utils from framework.celery_tasks import app as celery_app from osf.external.spam import tasks as spam_tasks from website import settings as website_settings from osf.management.commands.populate_notification_types import populate_notification_types +from osf import features + def pytest_configure(config): if 
not os.getenv('GITHUB_ACTIONS') == 'true': @@ -43,6 +43,8 @@ def pytest_configure(config): 'transitions.core', 'MARKDOWN', 'elasticsearch', + 'elastic_transport', + 'elasticsearch_metrics', ] for logger_name in SILENT_LOGGERS: logging.getLogger(logger_name).setLevel(logging.CRITICAL) @@ -138,45 +140,20 @@ def es6_client(setup_connections): @pytest.fixture(scope='function', autouse=True) -def _es_metrics_marker(request, worker_id): +def _es_metrics_marker(request): """Clear out all indices and index templates before and after tests marked with `es_metrics`. """ marker = request.node.get_closest_marker('es_metrics') - if marker: - es6_client = request.getfixturevalue('es6_client') - _temp_prefix = 'temp_metrics_' - _temp_wildcard = f'{_temp_prefix}-{worker_id}*' - - def _teardown_es_temps(): - es6_client.indices.delete(index=_temp_wildcard) - try: - es6_client.indices.delete_template(_temp_wildcard) - except es_exceptions.NotFoundError: - pass - - @contextlib.contextmanager - def _mock_metric_names(): - with contextlib.ExitStack() as _exit: - for _metric_class in es_metrics_registry.get_metrics(): - _exit.enter_context(mock.patch.object( - _metric_class, - '_template_name', # also used to construct index names - f'{_temp_prefix}-{worker_id}{_metric_class._template_name}', - )) - _exit.enter_context(mock.patch.object( - _metric_class, - '_template', # a wildcard string for indexes and templates - f'{_temp_prefix}-{worker_id}{_metric_class._template}', - )) - yield - - _teardown_es_temps() - with _mock_metric_names(): - call_command('sync_metrics') - yield - _teardown_es_temps() - else: + + if not marker: + yield + return + + with ( + override_switch(features.ELASTICSEARCH_METRICS, active=True), + djelme_test_backends(), + ): yield diff --git a/docker-compose.yml b/docker-compose.yml index 9914c24728b..42f7efc5ce7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,6 +13,8 @@ volumes: external: false elasticsearch6_data_vol: external: false + elasticsearch8_data_vol: + external: false rabbitmq_vol: external: false preprints_dist_vol: @@ -68,10 +70,35 @@ services: # Temporary: Remove when we've upgraded to ES6 elasticsearch6: image: docker.elastic.co/elasticsearch/elasticsearch:6.3.1 + environment: + - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage ports: - 9201:9200 volumes: - elasticsearch6_data_vol:/usr/share/elasticsearch/data + healthcheck: + start_period: 15s + test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"' + interval: 10s + retries: 30 + stdin_open: true + + elasticsearch8: + image: elasticsearch:8.19.14 + environment: + - discovery.type=single-node + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage + - xpack.ml.enabled=false + ports: + - 9202:9200 + volumes: + - elasticsearch8_data_vol:/usr/share/elasticsearch/data + healthcheck: + start_period: 15s + test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"' + interval: 10s + retries: 30 stdin_open: true postgres: diff --git a/framework/celery_tasks/routers.py b/framework/celery_tasks/routers.py index c33238780e8..d9d6e335286 100644 --- a/framework/celery_tasks/routers.py +++ b/framework/celery_tasks/routers.py @@ -11,6 +11,8 @@ def match_by_module(task_path): return CeleryConfig.task_med_queue if task_subpath in CeleryConfig.high_pri_modules: return CeleryConfig.task_high_queue + if task_subpath in CeleryConfig.background_migration_modules: + return CeleryConfig.task_background_migration_queue if task_subpath in 
CeleryConfig.remote_computing_modules: return CeleryConfig.task_remote_computing_queue if task_subpath in CeleryConfig.account_status_changes_modules: diff --git a/osf/management/commands/check_deploy_ready.py b/osf/management/commands/check_deploy_ready.py index 39fc98e9bee..527c900f273 100644 --- a/osf/management/commands/check_deploy_ready.py +++ b/osf/management/commands/check_deploy_ready.py @@ -18,6 +18,6 @@ def handle(self, *args, **options): ] if waffle.switch_is_active(features.ELASTICSEARCH_METRICS): - CHECKS.append(['check_metrics']) + CHECKS.append(['djelme_backend_check']) for check in CHECKS: call_command(*check) diff --git a/osf/management/commands/fake_metrics_reports.py b/osf/management/commands/fake_metrics_reports.py index 765d6e475c1..53e13472e74 100644 --- a/osf/management/commands/fake_metrics_reports.py +++ b/osf/management/commands/fake_metrics_reports.py @@ -8,6 +8,8 @@ UserSummaryReport, PreprintSummaryReport, ) +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf.models import PreprintProvider @@ -53,10 +55,27 @@ def fake_preprint_counts(days_back): ).save() +def fake_usage_reports(osfid: str, count: int): + _ym = YearMonth.from_date(date.today()).prior() + for _months in range(count): + PublicItemUsageReport.record( + item_osfid=osfid, + report_yearmonth=_ym, + view_count=(_vc := randint(0, 500)), + view_session_count=randint(0, _vc), + download_count=(_dc := randint(0, 300)), + download_session_count=randint(0, _dc), + ) + _ym = _ym.prior() + + class Command(BaseCommand): def handle(self, *args, **kwargs): if not settings.DEBUG: raise NotImplementedError('fake_reports requires DEBUG mode') fake_user_counts(1000) fake_preprint_counts(1000) + fake_usage_reports('blarg', 100) + fake_usage_reports('blerg', 50) + fake_usage_reports('bleg', 50) # TODO: more reports diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py new file mode 100644 index 00000000000..0c71d7b2307 --- /dev/null +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -0,0 +1,803 @@ +import collections +import datetime +import functools +import logging + +from django.apps import apps +from django.core.management import call_command +from django.core.management.base import BaseCommand +from django.db import OperationalError as DjangoOperationalError +from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError +from elasticsearch6 import helpers as es6_helpers +from elasticsearch6_dsl.connections import connections as es6_connections +from elasticsearch8.exceptions import TransportError as Elastic8TransportError +from elasticsearch8.helpers import BulkIndexError as Elastic8BulkIndexError +from elasticsearch_metrics.registry import djelme_registry +from elasticsearch_metrics.imps import elastic8 as djel8me +from psycopg2 import OperationalError as PostgresOperationalError + +from framework.celery_tasks import app as celery_app +from osf.metadata.rdfutils import OSF +from osf.metadata.osfmap_utils import osfmap_type_from_model, osf_iri, is_osf_component +from osf.metrics.preprint_metrics import ( + PreprintView, + PreprintDownload, +) +from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 +from osf.metrics import reports as es6_reports +from osf.metrics import es8_metrics, RegistriesModerationMetrics +from osf.metrics.reporters.public_item_usage import _iter_composite_bucket_keys +from osf.metrics.utils import YearMonth +from osf import models 
as osfdb +from website import settings as website_settings + + +_logger = logging.getLogger(__name__) + +### +# constants + +_USAGE_DAYS_BACK = 99 + +_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control + +_UNCHANGED_RECORDTYPES = { + # reports + es6_reports.StorageAddonUsage: es8_metrics.DailyStorageAddonUsageReportEs8, + es6_reports.DownloadCountReport: es8_metrics.DailyDownloadCountReportEs8, + es6_reports.InstitutionSummaryReport: es8_metrics.DailyInstitutionSummaryReportEs8, + es6_reports.NewUserDomainReport: es8_metrics.DailyNewUserDomainReportEs8, + es6_reports.NodeSummaryReport: es8_metrics.DailyNodeSummaryReportEs8, + es6_reports.OsfstorageFileCountReport: es8_metrics.DailyOsfstorageFileCountReportEs8, + es6_reports.PreprintSummaryReport: es8_metrics.DailyPreprintSummaryReportEs8, + es6_reports.UserSummaryReport: es8_metrics.DailyUserSummaryReportEs8, + es6_reports.SpamSummaryReport: es8_metrics.MonthlySpamSummaryReportEs8, + es6_reports.InstitutionalUserReport: es8_metrics.MonthlyInstitutionalUserReportEs8, + es6_reports.InstitutionMonthlySummaryReport: es8_metrics.MonthlyInstitutionSummaryReportEs8, + es6_reports.PrivateSpamMetricsReport: es8_metrics.MonthlyPrivateSpamMetricsReportEs8, + # events + RegistriesModerationMetrics: es8_metrics.RegistriesModerationEventEs8, +} + +_TASK_KWARGS = dict( + autoretry_for=( + DjangoOperationalError, + Elastic6ConnectionError, + Elastic8TransportError, + PostgresOperationalError, + ), + retry_backoff=True, # exponential backoff, with jitter + max_retries=20, +) + +### +# celery tasks + + +@celery_app.task(**_TASK_KWARGS) +def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): + _es6_recordtype = djelme_registry.get_recordtype('osf', es6_recordtype_name) + _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] + _convert_kwargs = ( + _convert_unchanged_cyclicrecord_kwargs + if issubclass(_es8_recordtype, djel8me.CyclicRecord) + else (lambda _kw: _kw) # no conversion needed for event record + ) + _each_new = ( + _es8_recordtype(**_convert_kwargs(_hit['_source'])) + for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when) + ) + _es8_bulk_save(_es8_recordtype, _each_new) + + +@celery_app.task(**_TASK_KWARGS) +def migrate_counted_usages(from_when: str, until_when: str): + # CountedAuthUsage => OsfCountedUsageEvent + _each_new = ( + _convert_counted_usage(_hit['_source']) + for _hit in _es6_scan_range( + CountedUsageEs6, + from_when=from_when, + until_when=until_when, + addl_filter={'exists': {'field': 'item_guid'}}, + ) + ) + _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new) + + +@celery_app.task(**_TASK_KWARGS) +def migrate_preprint_views(from_when: str, until_when: str): + # PreprintView => OsfCountedUsageEvent + _action_labels = ['view', 'web'] + _each_new = ( + _convert_preprint_metric(_hit, _action_labels) + for _hit in _es6_scan_range( + PreprintView, from_when=from_when, until_when=until_when + ) + ) + _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new) + + +@celery_app.task(**_TASK_KWARGS) +def migrate_preprint_downloads(from_when: str, until_when: str): + # PreprintDownload => OsfCountedUsageEvent + _action_labels = ['download'] + _each_new = ( + _convert_preprint_metric(_hit, _action_labels) + for _hit in _es6_scan_range( + PreprintDownload, from_when=from_when, until_when=until_when + ) + ) + _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new) + + 
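+# note: usage reports are migrated per-item (rather than per time-window) so
+# that cumulative counts can be chained in report_yearmonth order: the first
+# report for an item seeds its cumulative totals from the raw es6 usage events
+# (see _get_cumulative_usage), and each later report adds its own monthly
+# counts to the prior report's cumulative counts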
+@celery_app.task(**_TASK_KWARGS) +def migrate_usage_reports(osfid: str, until_when: str): + # from PublicItemUsageReport to MonthlyPublicItemUsageReportEs8 + _osfguid = osfdb.Guid.load(osfid) + _item_is_component = is_osf_component(_osfguid.referent) if _osfguid else False + + def _each_new(): + # go in sorted order to build cumulative counts + # (only a few dozen of these per item; should be fine to sort and load all at once) + _each_hit = _es6_scan_range( + es6_reports.PublicItemUsageReport, + until_when=until_when, + addl_filter={'term': {'item_osfid': osfid}}, + sort='report_yearmonth', + ) + _prior_report = None + for _hit in list(_each_hit): + yield ( + _prior_report := _convert_public_usage_report( + _hit['_source'], + _prior_report, + item_is_component=_item_is_component, + ) + ) + + _es8_bulk_save(es8_metrics.MonthlyPublicItemUsageReportEs8, _each_new()) + + +### +# various helper functions + + +def _es6_connection(): + return es6_connections.get_connection('osfmetrics_es6') + + +def _es8_bulk_save(es8_recordtype, each_new_record): + try: + es8_recordtype.bulk(each_new_record, stats_only=True) + except Elastic8BulkIndexError as _bulk_error: + # so actual errors show in celery task result + raise Exception(_bulk_error.errors) from _bulk_error + + +def _date_range( + range_start: datetime.date, + range_end: datetime.date, + step: datetime.timedelta = datetime.timedelta(days=1), +) -> collections.abc.Iterator[tuple[datetime.date, datetime.date]]: + _from_date = range_start + _until_date = range_start + step + while _from_date < range_end: + yield (_from_date, _until_date) + (_from_date, _until_date) = (_until_date, _until_date + step) + + +def _es6_scan_range( + es6_recordtype, + *, + from_when: str = '', + until_when: str, + addl_filter=None, + sort=None, +): + _timestamp_range = {'lt': until_when} + if from_when: + _timestamp_range['gte'] = from_when + _filters = [ + {'range': {'timestamp': _timestamp_range}}, + ] + if addl_filter: + _filters.append(addl_filter) + _query_body = {'query': {'bool': {'filter': _filters}}} + if sort: + _query_body['sort'] = sort + return es6_helpers.scan( + _es6_connection(), + index=es6_recordtype._template_pattern, + query=_query_body, + ) + + +def _es6_usage_report_counts() -> tuple[int, int]: + _search = es6_reports.PublicItemUsageReport.search() + _search.aggs.metric( + 'agg_item_count', + 'cardinality', + field='item_osfid', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _total_count = _response.hits.total + _item_count = ( + _response.aggregations.agg_item_count.value + if 'agg_item_count' in _response.aggregations + else 0 + ) + return (_total_count, _item_count) + + +def _es8_usage_report_counts() -> tuple[int, int]: + _search = es8_metrics.MonthlyPublicItemUsageReportEs8.search() + _search.aggs.metric( + 'agg_item_count', + 'cardinality', + field='item_osfid', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _total_count = _response.hits.total.value + _item_count = ( + _response.aggregations.agg_item_count.value + if 'agg_item_count' in _response.aggregations + else 0 + ) + return (_total_count, _item_count) + + +def _get_es6_field_names(es6_recordtype): + ''' + adapted from DocumentBase._get_field_names in elasticsearch8.dsl + ''' + for _field_name in es6_recordtype._doc_type.mapping: + _field = es6_recordtype._doc_type.mapping[_field_name] + if hasattr(_field, '_doc_class'): + for _sub_field in _get_es6_field_names(_field._doc_class): + yield 
f'{_field_name}.{_sub_field}' + else: + yield _field_name + + +def _assert_field_unchangedness(es6_recordtype, es8_recordtype): + _es6_fields = set(_get_es6_field_names(es6_recordtype)) + _es8_fields = set(es8_recordtype._get_field_names()) + + # remove fields intentionally removed in migration + if issubclass(es6_recordtype, es6_reports.DailyReport): + assert issubclass(es8_recordtype, djel8me.CyclicRecord) + _es6_fields.remove('timestamp') + _es6_fields.remove('report_date') + elif issubclass(es6_recordtype, es6_reports.MonthlyReport): + assert issubclass(es8_recordtype, djel8me.CyclicRecord) + _es6_fields.remove('timestamp') + _es6_fields.remove('report_yearmonth') + else: + assert issubclass(es8_recordtype, djel8me.EventRecord) + + # remove fields intentionally added in migration + _es8_fields.remove('timeseries_timeparts') + if issubclass(es8_recordtype, djel8me.CyclicRecord): + _es8_fields.remove('created') + _es8_fields.remove('cycle_coverage') + + # all remaining fields should match + assert _es6_fields == _es8_fields + + +def _semverish_from_yearmonth(given_yearmonth: str): + _ym = YearMonth.from_str(given_yearmonth) + return f'{_ym.year}.{_ym.month}' + + +def _semverish_from_date(given_date: str): + _d = datetime.date.fromisoformat(given_date) + return f'{_d.year}.{_d.month}.{_d.day}' + + +def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict: + def _each_kwarg(): + for _key, _val in es6_source.items(): + if _key == 'report_yearmonth': + # report_yearmonth converts to cycle_coverage Y.M + yield ('cycle_coverage', _semverish_from_yearmonth(_val)) + elif _key == 'report_date': + # report_date converts to cycle_coverage Y.M.D + yield ('cycle_coverage', _semverish_from_date(_val)) + elif _key != 'timestamp': + # skipping timestamp; on daily/monthly reports just copied from yearmonth/date + yield (_key, _val) + + return dict(_each_kwarg()) + + +def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageEvent: + return es8_metrics.OsfCountedUsageEvent( + # fields from djelme.CountedUsageRecord: + timestamp=source['timestamp'], + sessionhour_id=source['session_id'], + platform_iri=source.get('platform_iri') or website_settings.DOMAIN, + database_iri=_convert_database_iri( + provider_id=source.get('provider_id'), + osf_model_name=source.get('item_type'), + ), + within_iris=[ + osf_iri(_within_osfid) + for _within_osfid in source.get('surrounding_guids', ()) + ], + # fields from OsfCountedUsageEvent: + item_osfid=source['item_guid'], + item_type=_convert_item_type( + source.get('item_type'), + has_surrounding_items=bool(source.get('surrounding_guids')), + ), + item_public=source.get('item_public', True), + provider_id=source.get('provider_id', 'osf'), + user_is_authenticated=source.get('user_is_authenticated', False), + action_labels=source.get('action_labels'), + pageview_info=source.get('pageview_info'), + ) + + +def _convert_preprint_metric( + hit: dict, action_labels: list[str] +) -> es8_metrics.OsfCountedUsageEvent: + _source = hit['_source'] + _doc_id = hit['_id'] + return es8_metrics.OsfCountedUsageEvent.record( + using=False, # don't save yet; will save in bulk + # fields used to compute a sessionhour_id: + timestamp=datetime.datetime.fromisoformat(_source['timestamp']), + user_id=_source.get('user_id'), + client_session_id=_doc_id, # unique session per event (best can do) + # fields from djelme.CountedUsageRecord: + platform_iri=website_settings.DOMAIN, + database_iri=_convert_database_iri( + provider_id=_source.get('provider_id'), + 
osf_model_name='preprint', + ), + # fields from OsfCountedUsageEvent: + item_osfid=_source['preprint_id'], + item_type=OSF.Preprint, + item_public=True, + provider_id=_source.get('provider_id'), + user_is_authenticated=bool(_source.get('user_id')), + action_labels=action_labels, + ) + + +def _convert_public_usage_report( + source: dict, + prior_report: es8_metrics.MonthlyPublicItemUsageReportEs8 | None, + item_is_component: bool, +) -> es8_metrics.MonthlyPublicItemUsageReportEs8: + if prior_report is None: + _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( + osfid=source['item_osfid'], + until_when=YearMonth.from_str(source['report_yearmonth']).month_end(), + is_preprint=(source.get('item_type') == 'preprint'), + ) + else: + _c_views = prior_report.cumulative_view_count + source.get('view_count', 0) + _c_view_sess = prior_report.cumulative_view_session_count + source.get( + 'view_session_count', 0 + ) + _c_downloads = prior_report.cumulative_download_count + source.get( + 'download_count', 0 + ) + _c_download_sess = prior_report.cumulative_download_session_count + source.get( + 'download_session_count', 0 + ) + return es8_metrics.MonthlyPublicItemUsageReportEs8( + cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), + item_osfid=source['item_osfid'], + item_type=_convert_item_type( + source.get('item_type'), + has_surrounding_items=item_is_component, + ), + provider_id=source.get('provider_id'), + platform_iri=source.get('platform_iri') or website_settings.DOMAIN, + view_count=source.get('view_count', 0), + view_session_count=source.get('view_session_count', 0), + cumulative_view_count=_c_views, + cumulative_view_session_count=_c_view_sess, + download_count=source.get('download_count', 0), + download_session_count=source.get('download_session_count', 0), + cumulative_download_count=_c_downloads, + cumulative_download_session_count=_c_download_sess, + ) + + +def _get_cumulative_usage(osfid: str, until_when, *, is_preprint: bool): + if is_preprint: + _views = _cumulative_preprint_count(PreprintView, osfid, until_when) + _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when) + _view_sess, _download_sess = 0, 0 # no session info on preprints (yet) + else: + _views, _view_sess = _cumulative_countedusage_views(osfid, until_when) + _downloads, _download_sess = _cumulative_countedusage_downloads( + osfid, until_when + ) + return (_views, _view_sess, _downloads, _download_sess) + + +def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]: + '''compute view_session_count separately to avoid double-counting + + (the same session may be represented in both the composite agg on `item_guid` + and that on `surrounding_guids`) + ''' + # copied/adapted from osf.metrics.reporters.public_item_usage + _search = ( + CountedUsageEs6.search() + .filter('term', item_public=True) + .filter('range', timestamp={'lt': until_when}) + .filter('term', action_labels='view') + .filter( + 'bool', + should=[ + {'term': {'item_guid': osfid}}, + {'term': {'surrounding_guids': osfid}}, + ], + minimum_should_match=1, + ) + .extra(size=0) # only aggregations, no hits + ) + _search.aggs.metric( + 'agg_session_count', + 'cardinality', + field='session_id', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _view_count = _response.hits.total + _view_session_count = ( + _response.aggregations.agg_session_count.value + if 'agg_session_count' in _response.aggregations + else 0 + ) + return (_view_count, 
_view_session_count)
+
+
+def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]:
+    '''aggregate downloads on each osfid (not including components/files)'''
+    # copied/adapted from osf.metrics.reporters.public_item_usage
+    _search = (
+        CountedUsageEs6.search()
+        .filter('term', item_public=True)
+        .filter('range', timestamp={'lt': until_when})
+        .filter('term', action_labels='download')
+        .filter('term', item_guid=osfid)
+        .extra(size=0)  # only aggregations, no hits
+    )
+    _search.aggs.metric(
+        'agg_session_count',
+        'cardinality',
+        field='session_id',
+        precision_threshold=_MAX_CARDINALITY_PRECISION,
+    )
+    _response = _search.execute()
+    _download_count = _response.hits.total
+    _download_session_count = (
+        _response.aggregations.agg_session_count.value
+        if 'agg_session_count' in _response.aggregations
+        else 0
+    )
+    return (_download_count, _download_session_count)
+
+
+def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int:
+    '''aggregate usage (views or downloads, per preprint_metric_cls) on a preprint'''
+    # copied/adapted from osf.metrics.preprint_metrics
+    _search = (
+        preprint_metric_cls.search()
+        .filter('term', preprint_id=osfid)
+        .filter('range', timestamp={'lt': until_when})
+        .extra(size=0)  # no hits; only aggs
+    )
+    _search.aggs.metric('agg_count', 'sum', field='count')
+    _response = _search.execute()
+    _usage_count = (
+        int(_response.aggregations.agg_count.value)
+        if hasattr(_response.aggregations, 'agg_count')
+        else 0
+    )
+    return _usage_count
+
+
+def _convert_item_type(osf_model_name: str | list[str] | None, has_surrounding_items: bool):
+    if isinstance(osf_model_name, list):
+        osf_model_name = osf_model_name[0] if osf_model_name else None
+    if osf_model_name:
+        try:
+            return osfmap_type_from_model(
+                apps.get_model('osf', osf_model_name),
+                is_component=has_surrounding_items,
+            )
+        except LookupError:
+            pass
+    return OSF.Object  # fine, fallback to abstract type
+
+
+def _convert_database_iri(provider_id: str | None, osf_model_name: str) -> str:
+    if not provider_id:
+        return website_settings.DOMAIN  # osf is a provider, sure why not
+
+    match osf_model_name:  # lower-cased osf.models class names
+        case 'node' | 'osfuser':  # implicit untyped 'osf' provider
+            return website_settings.DOMAIN
+        case 'preprint':  # match PreprintProvider.get_semantic_iri
+            return f'{website_settings.DOMAIN}preprints/{provider_id}'
+        case 'registration':  # match RegistrationProvider.get_semantic_iri
+            return f'{website_settings.DOMAIN}registries/{provider_id}'
+        case _ if 'file' in osf_model_name:
+            # file providers are a different thing that don't really have an iri, just an id
+            return f'urn:files.osf.io:{provider_id}'
+        case _:  # give up gracefully
+            _logger.error(
+                f'unknown model {osf_model_name!r} with provider {provider_id!r}'
+            )
+            return f'urn:osf.io:{provider_id}'
+
+
+def _each_usage_report_osfid(until_when, after_osfid=None):
+    _search = (
+        es6_reports.PublicItemUsageReport.search()
+        .filter('range', timestamp={'lt': until_when})
+        .extra(size=0)
+    )
+    _search.aggs.bucket(
+        'agg_osfid',
+        'composite',
+        sources=[{'osfid': {'terms': {'field': 'item_osfid'}}}],
+        size=500,
+    )
+    return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
+
+
+###
+# the command itself
+
+
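+# usage sketch (flag names from add_arguments below; the manage.py command name
+# is outside this hunk, so shown here as a placeholder):
+#   manage.py <this_command>            # display es6-vs-es8 record counts only
+#   manage.py <this_command> --start    # record a start time and schedule celery tasks
+#   manage.py <this_command> --usage-reports --no-counts --start
+#                                       # migrate only the per-item usage reports
+class Command(BaseCommand):
+    def add_arguments(self, parser):
+        parser.add_argument(
+            '--no-setup',
+            action='store_true',
+        )
+        parser.add_argument(
+            '--no-counts',
+            action='store_true',
+        )
+        parser.add_argument(
+            '--clear-state',
+            action='store_true',
+        )
+        parser.add_argument(
+            '--clear-es8-data',
+            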
action='store_true', + ) + parser.add_argument( + '--start', + action='store_true', + ) + parser.add_argument( + '--unchanged', + action='store_true', + ) + parser.add_argument( + '--usage-events', + action='store_true', + ) + parser.add_argument( + '--usage-reports', + action='store_true', + ) + + @functools.cached_property + def _migration_started_at(self): + return es8_metrics.Elastic6To8State.get_started_at() + + def handle( + self, + *, + no_setup, + no_counts, + clear_state, + clear_es8_data, + start, + unchanged, + usage_events, + usage_reports, + **kwargs, + ): + self._quiet_chatty_loggers() + if not no_setup: + call_command('djelme_backend_setup') + if clear_state: + self._clear_state() + if clear_es8_data: + self._clear_es8_data(unchanged, usage_events, usage_reports) + self._check_started_at(start_now=start) + _default_all = not any((unchanged, usage_events, usage_reports)) + if unchanged or _default_all: + self._handle_unchanged(start=start, no_counts=no_counts) + if usage_events or _default_all: + self._handle_usage_events(start=start, no_counts=no_counts) + if usage_reports or _default_all: + self._handle_usage_reports(start=start, no_counts=no_counts) + if not no_counts: + self.stdout.write('(counts may be approximate)') + + def _handle_unchanged(self, *, start: bool, no_counts: bool): + # for each (unchanged) report/event: + for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): + _assert_field_unchangedness(_es6_cls, _es8_cls) + if not no_counts: + # display counts + _es6_count = _es6_cls.search().count() + _es8_count = _es8_cls.search().count() + self._write_tabbed('es6', _es6_cls, _es6_count) + self._write_tabbed( + 'es8', + _es8_cls, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + if start: # schedule task + self.stdout.write( + f'starting {_es6_cls.__name__} => {_es8_cls.__name__}' + ) + migrate_unchanged_recordtype.delay( + _es6_cls.__name__, self._migration_started_at.isoformat() + ) + + def _handle_usage_events(self, *, start: bool, no_counts: bool): + # for counted-usage events: + _started = self._migration_started_at or datetime.datetime.now() + _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() + _range_end = _started.date() + datetime.timedelta(days=1) + if not no_counts: + # display counts for each view/download event type + _range_q = { + 'range': { + 'timestamp': { + 'gte': _range_start.isoformat(), + 'lt': _range_end.isoformat(), + } + } + } + _es6_usage_count_q = { + 'bool': { + 'filter': [_range_q, {'exists': {'field': 'item_guid'}}], + }, + } + _es6_pview_count = PreprintView.search().filter(_range_q).count() + _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count() + _es6_usage_event_count = CountedUsageEs6.search().filter(_es6_usage_count_q).count() + _es6_count = ( + _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + ) + _es8_count = es8_metrics.OsfCountedUsageEvent.search().filter(_range_q).count() + self._write_tabbed('es6', PreprintView, _es6_pview_count) + self._write_tabbed('es6', PreprintDownload, _es6_pdownload_count) + self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed( + 'es6', f'(total between {_range_start} and {_range_end})', _es6_count + ) + self._write_tabbed( + 'es8', + es8_metrics.OsfCountedUsageEvent, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + if start: # schedule (per-day?) 
tasks (if --start)
+            self.stdout.write(
+                f'starting usages => {es8_metrics.OsfCountedUsageEvent.__name__}'
+            )
+            for _from_date, _until_date in _date_range(_range_start, _range_end):
+                _from_str = _from_date.isoformat()
+                _until_str = _until_date.isoformat()
+                migrate_counted_usages.delay(_from_str, _until_str)
+                migrate_preprint_views.delay(_from_str, _until_str)
+                migrate_preprint_downloads.delay(_from_str, _until_str)
+
+    def _handle_usage_reports(self, *, start: bool, no_counts: bool):
+        if not no_counts:
+            # display counts of reports and distinct items
+            _es6_count, _es6_item_count = _es6_usage_report_counts()
+            _es8_count, _es8_item_count = _es8_usage_report_counts()
+            self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count)
+            self._write_tabbed(
+                'es8',
+                es8_metrics.MonthlyPublicItemUsageReportEs8,
+                _es8_count,
+                style=self._eq_style(_es8_count, _es6_count),
+            )
+            self._write_tabbed(
+                'es6',
+                es6_reports.PublicItemUsageReport,
+                'osfid count:',
+                _es6_item_count,
+            )
+            self._write_tabbed(
+                'es8',
+                es8_metrics.MonthlyPublicItemUsageReportEs8,
+                'osfid count:',
+                _es8_item_count,
+                style=self._eq_style(_es8_item_count, _es6_item_count),
+            )
+        # (if --start) schedule a task per item (by composite agg on es6 public usage reports)
+        # each item-task iterates through that item's reports oldest to newest, adding cumulative counts
+        if start:
+            self.stdout.write(
+                f'starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.MonthlyPublicItemUsageReportEs8.__name__}'
+            )
+            for _osfid in _each_usage_report_osfid(
+                until_when=self._migration_started_at
+            ):
+                migrate_usage_reports.delay(
+                    _osfid, self._migration_started_at.isoformat()
+                )
+
+    def _check_started_at(self, start_now):
+        _started_at = self._migration_started_at
+        if _started_at:
+            self.stdout.write(
+                f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}'
+            )
+        elif start_now:
+            _started_at = es8_metrics.Elastic6To8State.set_started_at_now()
+            del self._migration_started_at  # clear cache
+            self.stdout.write(
+                f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}'
+            )
+        else:
+            self.stdout.write(
+                'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)'
+            )
+
+    def _clear_state(self):
+        self.stdout.write(
+            'clearing all migration state (start time, etc)', self.style.NOTICE
+        )
+        es8_metrics.Elastic6To8State.search().query({'match_all': {}}).delete()
+        es8_metrics.Elastic6To8State.refresh()
+
+    def _clear_es8_data(self, unchanged, usage_events, usage_reports):
+        _default_all = not any((unchanged, usage_events, usage_reports))
+        _to_clear = []
+        if _default_all or unchanged:
+            _to_clear.extend(_UNCHANGED_RECORDTYPES.values())
+        if _default_all or usage_events:
+            _to_clear.append(es8_metrics.OsfCountedUsageEvent)
+        if _default_all or usage_reports:
+            _to_clear.append(es8_metrics.MonthlyPublicItemUsageReportEs8)
+        for _es8_recordtype in _to_clear:
+            self.stdout.write(
+                f'clearing {_es8_recordtype.__name__}', self.style.NOTICE
+            )
+            _es8_recordtype.search().query({'match_all': {}}).delete()
+            _es8_recordtype.refresh()
+
+    def _eq_style(self, num: int, should_be: int):
+        return self.style.SUCCESS if (num == should_be) else self.style.WARNING
+
+    def _write_tabbed(self, *strables, style=None):
+        def _to_str(strable):
+            if isinstance(strable, type):
+                return strable.__name__
+            return str(strable)
+
+        self.stdout.write('\t'.join(map(_to_str, strables)), style)
+
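+    # the elasticsearch clients log per-request chatter at INFO; raise their
+    # level so the tabbed count output above stays readable
+    def _quiet_chatty_loggers(self):
+        _chatty_loggers = [
+            'elasticsearch',
+            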
'elastic_transport', + 'elasticsearch_metrics', + ] + for logger_name in _chatty_loggers: + logging.getLogger(logger_name).setLevel(logging.ERROR) diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index 83ed5f6d985..c5dd4034777 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -3,12 +3,14 @@ from django.core.management.base import BaseCommand from django.db import OperationalError as DjangoOperationalError -from elasticsearch.exceptions import ConnectionError as ElasticConnectionError +from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError +from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app import framework.sentry from osf.metrics.reporters import AllMonthlyReporters +from osf.metrics.reports import MonthlyReport from osf.metrics.utils import YearMonth @@ -17,7 +19,8 @@ _CONTINUE_AFTER_ERRORS = ( DjangoOperationalError, - ElasticConnectionError, + Elastic6ConnectionError, + Elastic8ConnectionError, PostgresOperationalError, ) @@ -70,11 +73,7 @@ def schedule_monthly_reporter( @celery_app.task( name='management.commands.monthly_reporter_do', - autoretry_for=( - DjangoOperationalError, - ElasticConnectionError, - PostgresOperationalError, - ), + autoretry_for=_CONTINUE_AFTER_ERRORS, max_retries=5, retry_backoff=True, ) @@ -85,9 +84,10 @@ def monthly_reporter_do(reporter_key: str, yearmonth: str, report_kwargs: dict): framework.sentry.log_exception(exc) return - _report = _reporter.report(**report_kwargs) - if _report is not None: - _report.report_yearmonth = _reporter.yearmonth + _reports = _reporter.report(**report_kwargs) + for _report in _reports: + if isinstance(_report, MonthlyReport) and (_report.report_yearmonth is None): + _report.report_yearmonth = _reporter.yearmonth _report.save() _followup_task = _reporter.followup_task(_report) if _followup_task is not None: diff --git a/osf/management/commands/osf_shell.py b/osf/management/commands/osf_shell.py index 851895623ac..69443d004be 100644 --- a/osf/management/commands/osf_shell.py +++ b/osf/management/commands/osf_shell.py @@ -32,7 +32,7 @@ def get_user_imports(): from django.db.models import Model from django_extensions.management.commands import shell_plus from django_extensions.management.utils import signalcommand -from elasticsearch_metrics.registry import registry as metrics_registry +from elasticsearch_metrics.registry import djelme_registry def header(text): @@ -160,7 +160,7 @@ def get_osf_imports(self): def get_metrics(self): return { each.__name__: each - for each in metrics_registry.get_metrics() + for each in djelme_registry.each_recordtype() } def get_grouped_imports(self, options): diff --git a/osf/management/commands/reindex_es6.py b/osf/management/commands/reindex_es6.py index c37d0e34f2c..8961ea6fff1 100644 --- a/osf/management/commands/reindex_es6.py +++ b/osf/management/commands/reindex_es6.py @@ -4,7 +4,7 @@ import logging from django.core.management.base import BaseCommand -from elasticsearch_dsl import connections +from elasticsearch6_dsl import connections from elasticsearch_metrics.registry import registry logger = logging.getLogger(__name__) diff --git a/osf/management/commands/sync_databases.py b/osf/management/commands/sync_databases.py index c31d63ea16e..b5030b4bba7 100644 --- 
a/osf/management/commands/sync_databases.py +++ b/osf/management/commands/sync_databases.py @@ -20,7 +20,7 @@ def handle(self, *args, **options): ['migrate'], ] if waffle.switch_is_active(features.ELASTICSEARCH_METRICS): - COMMANDS.append(['sync_metrics']) + COMMANDS.append(['djelme_backend_setup']) for check in COMMANDS: call_command(*check) diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index 240e5c96561..74c20131464 100644 --- a/osf/metadata/osf_gathering.py +++ b/osf/metadata/osf_gathering.py @@ -13,6 +13,12 @@ from osf import models as osfdb from osf.metadata import gather from osf.metadata.definitions.datacite import DATACITE_RESOURCE_TYPES_GENERAL +from osf.metadata.osfmap_utils import ( + osfmap_type, + osf_iri, + is_osf_component, + osfid_from_iri, +) from osf.metadata.rdfutils import ( DATACITE, DCAT, @@ -30,7 +36,6 @@ SKOS, checksum_iri, format_dcterms_extent, - without_namespace, smells_like_iri, ) from osf.metrics.reports import PublicItemUsageReport @@ -319,15 +324,13 @@ def get_expiration_date(self, basket: gather.Basket) -> datetime.date | None: ##### END osfmap ##### -##### BEGIN osf-specific utils ##### - class OsfFocus(gather.Focus): def __init__(self, osf_item): if isinstance(osf_item, str): osf_item = osfdb.base.coerce_guid(osf_item).referent super().__init__( iri=osf_iri(osf_item), - rdftype=get_rdf_type(osf_item), + rdftype=osfmap_type(osf_item), provider_id=osf_item.provider._id if (osf_item and getattr(osf_item, 'type', '') == 'osf.registration' and osf_item.provider) else None ) self.dbmodel = osf_item @@ -337,54 +340,6 @@ def __init__(self, osf_item): pass # is ok for a focus to be something non-osfguidy -def is_root(osf_node): - return (osf_node.root_id == osf_node.id) - - -def get_rdf_type(osfguid_referent): - if isinstance(osfguid_referent, osfdb.Guid): - osfguid_referent = osfguid_referent.referent - - if isinstance(osfguid_referent, osfdb.OSFUser): - return DCTERMS.Agent - if isinstance(osfguid_referent, osfdb.BaseFileNode): - return OSF.File - if isinstance(osfguid_referent, osfdb.Preprint): - return OSF.Preprint - if isinstance(osfguid_referent, osfdb.Registration): - return ( - OSF.Registration - if is_root(osfguid_referent) - else OSF.RegistrationComponent - ) - if isinstance(osfguid_referent, osfdb.Node): - return ( - OSF.Project - if is_root(osfguid_referent) - else OSF.ProjectComponent - ) - raise NotImplementedError - - -def osf_iri(guid_or_model): - """return a rdflib.URIRef or None - - @param guid_or_model: a string, Guid instance, or another osf model instance - @returns rdflib.URIRef or None - """ - guid = osfdb.base.coerce_guid(guid_or_model) - return OSFIO[guid._id] - - -def osfguid_from_iri(iri: str) -> str: - if iri.startswith(OSFIO): - return without_namespace(iri, OSFIO) - raise ValueError(f'expected iri starting with "{OSFIO}" (got "{iri}")') - - -##### END osf-specific utils ##### - - ##### BEGIN the gatherers ##### # @@ -720,7 +675,7 @@ def gather_file_mediatype(focus): @gather.er(DCTERMS.hasPart, DCTERMS.isPartOf) def gather_parts(focus): if isinstance(focus.dbmodel, osfdb.AbstractNode): - if not is_root(focus.dbmodel) and focus.dbmodel.root.is_public: + if is_osf_component(focus.dbmodel) and focus.dbmodel.root.is_public: root_focus = OsfFocus(focus.dbmodel.root) yield (OSF.hasRoot, root_focus) child_relations = ( @@ -1132,7 +1087,7 @@ def gather_cedar_templates(focus): @gather.er(OSF.usage) def gather_last_month_usage(focus): _usage_report = PublicItemUsageReport.for_last_month( - 
item_osfid=osfguid_from_iri(focus.iri), + item_osfid=osfid_from_iri(focus.iri), ) if _usage_report is not None: _usage_report_ref = rdflib.BNode() diff --git a/osf/metadata/osfmap_utils.py b/osf/metadata/osfmap_utils.py new file mode 100644 index 00000000000..031cd160eac --- /dev/null +++ b/osf/metadata/osfmap_utils.py @@ -0,0 +1,69 @@ +from osf.metadata.rdfutils import ( + DCTERMS, + OSF, + OSFIO, + without_namespace, +) +from osf import models as osfdb + + +def is_osf_component(osf_node) -> bool: + return ( + isinstance(osf_node, osfdb.AbstractNode) + and osf_node.root_id != osf_node.id + ) + + +def osfmap_type_from_model(model_cls, *, is_component=None): + if issubclass(model_cls, osfdb.OSFUser): + return DCTERMS.Agent + if issubclass(model_cls, osfdb.BaseFileNode): + return OSF.File + if issubclass(model_cls, osfdb.Preprint): + return OSF.Preprint + if issubclass(model_cls, osfdb.Registration): + if is_component is None: + raise ValueError(f'osfmap_type_from_model requires `is_component` for {model_cls}') + return ( + OSF.RegistrationComponent + if is_component + else OSF.Registration + ) + if issubclass(model_cls, osfdb.Node): + if is_component is None: + raise ValueError(f'osfmap_type_from_model requires `is_component` for {model_cls}') + return ( + OSF.ProjectComponent + if is_component + else OSF.Project + ) + raise LookupError(model_cls) + + +def osfmap_type(osf_obj): + if isinstance(osf_obj, osfdb.Guid): + osf_obj = osf_obj.referent + return osfmap_type_from_model(type(osf_obj), is_component=is_osf_component(osf_obj)) + + +def osf_iri(guid_or_model): + """return a rdflib.URIRef or None + + @param guid_or_model: a string, Guid instance, or another osf model instance + @returns rdflib.URIRef or None + """ + _osfid: str = ( + guid_or_model + if isinstance(guid_or_model, str) + else osfdb.base.coerce_guid(guid_or_model)._id + ) + return OSFIO[_osfid] + + +def osfid_from_iri(iri: str) -> str: + if not iri.startswith(OSFIO): + raise ValueError(f'expected iri starting with "{OSFIO}" (got {iri!r})') + _osfid = without_namespace(iri, OSFIO) + if not _osfid or '/' in _osfid: + raise ValueError(f'expected iri path with exactly one segment (got {_osfid!r} from {iri!r})') + return _osfid diff --git a/osf/metadata/serializers/linkset.py b/osf/metadata/serializers/linkset.py index f83dad00ebd..3ee907d0532 100644 --- a/osf/metadata/serializers/linkset.py +++ b/osf/metadata/serializers/linkset.py @@ -16,7 +16,7 @@ import rdflib from ._base import MetadataSerializer -from osf.metadata.osf_gathering import osfguid_from_iri +from osf.metadata.osf_gathering import osfid_from_iri from osf.metadata.rdfutils import (DOI, DATACITE, DCTERMS, OWL, RDF, OSF, DCAT, SCHEMA, DATACITE_SCHEMA_RESOURCE_TYPE_GENERAL_MAPPING, map_resource_type_general_datacite_to_scheme) from website.settings import DOMAIN from website.util import web_url_for @@ -74,7 +74,7 @@ def _each_link(self) -> Iterator[SignpostLink]: base_metadata_url = urljoin(DOMAIN, web_url_for( 'metadata_download', # name of a view function mapped in website/routes.py - guid=osfguid_from_iri(self.basket.focus.iri), + guid=osfid_from_iri(self.basket.focus.iri), )) split_base_metadata_url = urlsplit(base_metadata_url) diff --git a/osf/metrics/__init__.py b/osf/metrics/__init__.py index 0e7b1a1cf32..6056e6d92f3 100644 --- a/osf/metrics/__init__.py +++ b/osf/metrics/__init__.py @@ -17,6 +17,8 @@ StorageAddonUsage, UserSummaryReport, ) +from . 
import es8_metrics
+
 
 DAILY_REPORTS = (
     DownloadCountReport,
@@ -36,4 +38,5 @@
     'PreprintView',
     'PreprintDownload',
     'RegistriesModerationMetrics',
+    'es8_metrics',
 )
diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py
index 39b3b74129b..41ea012fda5 100644
--- a/osf/metrics/counted_usage.py
+++ b/osf/metrics/counted_usage.py
@@ -4,7 +4,7 @@
 from urllib.parse import urlsplit
 
 from elasticsearch6_dsl import InnerDoc, analyzer, tokenizer
-from elasticsearch_metrics import metrics
+import elasticsearch_metrics.imps.elastic6 as metrics
 from elasticsearch_metrics.signals import pre_save
 from django.dispatch import receiver
 import pytz
diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py
new file mode 100644
index 00000000000..1d73009ed5b
--- /dev/null
+++ b/osf/metrics/es8_metrics.py
@@ -0,0 +1,579 @@
+import datetime
+import enum
+import functools
+from urllib.parse import urlsplit
+
+import elasticsearch8.dsl as esdsl
+from elasticsearch_metrics import DAILY, MONTHLY, YEARLY
+import elasticsearch_metrics.imps.elastic8 as djelme
+
+from osf.metadata.osfmap_utils import (
+    osfmap_type,
+    osf_iri,
+    osfid_from_iri,
+)
+from osf.metrics.counted_usage import _get_surrounding_guids
+from osf.metrics.utils import YearMonth
+from osf import models as osfdb
+from website import settings as website_settings
+
+
+###
+# custom dsl fields
+
+class YearmonthField(esdsl.Date):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs, format='strict_year_month')
+
+    def deserialize(self, data):
+        if isinstance(data, int):
+            # elasticsearch stores dates as utc milliseconds since the unix epoch
+            _as_datetime = datetime.datetime.fromtimestamp(data // 1000, tz=datetime.UTC)
+            return YearMonth.from_date(_as_datetime)
+        elif data is None:
+            return None
+        try:
+            return YearMonth.from_any(data)
+        except ValueError:
+            raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth')
+
+    def serialize(self, data, skip_empty=True):
+        if isinstance(data, str):
+            return data
+        elif isinstance(data, YearMonth):
+            return str(data)
+        elif isinstance(data, (datetime.datetime, datetime.date)):
+            return str(YearMonth.from_date(data))
+        elif data is None:
+            return None
+        else:
+            raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM')
+
+
+###
+# inner objects for events
+
+route_prefix_analyzer = esdsl.analyzer(
+    'route_prefix_analyzer',
+    tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),
+)
+
+
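+# ('path_hierarchy' on '.' emits every route-name prefix -- a route_name of
+# 'project.files.view' yields 'project', 'project.files', 'project.files.view' --
+# so the 'by_prefix' subfield below supports prefix matching on route names)
+class PageviewInfo(esdsl.InnerDoc):
+    """PageviewInfo
+
+    for CountedAuthUsage generated by viewing a web page
+    """
+
+    # fields that should be provided
+    referer_url: str | None
+    page_url: str | None
+    page_title: str | None
+    route_name: str | None = esdsl.mapped_field(esdsl.Keyword(
+        fields={
+            'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer),
+        },
+    ))
+
+    # fields auto-filled
+    page_path: str | None
+    referer_domain: str | None
+    hour_of_day: int | None
+
+
+###
+# Event records
+
+class OsfCountedUsageEvent(djelme.CountedUsageRecord):
+    '''
+    Aim to support a COUNTER-style reporting api
+    https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+    https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
+    '''
+    UNIQUE_TOGETHER_FIELDS = (
+        'platform_iri',
+        'sessionhour_id',
+        'action_labels',
+        # include some non-field properties for more complex logic to
+        # slightly better approximate `counter:Double-Click Filtering`
+        # and allow for multiple pages describing the 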
same item_iri + '_page_url_or_osfid', # non-field property + '_timestamp_date', # non-field property + '_timestamp_30sec_window', # non-field property + ) + + # inherited fields: + # timestamp: datetime.datetime + # platform_iri: str + # database_iri: str + # item_iri: str + # sessionhour_id: str + # within_iris: list[str] + + # osf-specific fields: + item_osfid: str + item_type: str + item_public: bool + provider_id: str | None + user_is_authenticated: bool + action_labels: list[str] + pageview_info: PageviewInfo | None + + class Meta: + timeseries_index_timedepth = MONTHLY + + class ActionLabel(enum.Enum): + SEARCH = 'search' # counter:Search + VIEW = 'view' # counter:Investigation + DOWNLOAD = 'download' # counter:Request + WEB = 'web' # counter:Regular (aka "pageview") + API = 'api' # counter:TDM (aka "non-web api usage") + + @classmethod + def record(cls, **kwargs): + # autofill `user_is_authenticated` before `user_id` discarded (couldn't in `clean`) + if 'user_is_authenticated' not in kwargs: + kwargs['user_is_authenticated'] = bool(kwargs.get('user_id')) + return super().record(**kwargs) + + @property + def _page_url_or_osfid(self): + # for UNIQUE_TOGETHER_FIELDS + return ( + self.pageview_info.page_url + if self.pageview_info is not None and self.pageview_info.page_url is not None + else self.item_osfid + ) + + @property + def _timestamp_date(self): + # for UNIQUE_TOGETHER_FIELDS + return self.timestamp.date() + + @property + def _timestamp_30sec_window(self): + # for UNIQUE_TOGETHER_FIELDS + # slice the day into an array of 30-second windows, + # find this timestamp's windowslice index + _day_start = datetime.datetime( + self.timestamp.year, + self.timestamp.month, + self.timestamp.day, + tzinfo=self.timestamp.tzinfo, + ) + _time_in_seconds = (self.timestamp - _day_start).total_seconds() + return int(_time_in_seconds / 30) # 30-second windows + + @functools.cached_property + def _osfid_referent(self): + # for use by autofill methods, if needed + _osfguid = osfdb.Guid.load(self.item_osfid) + return _osfguid.referent if _osfguid else None + + def clean(self): + self._autofill_platform_iri() + self._autofill_item_iri_and_osfid() + self._autofill_item_public() + self._autofill_item_type() + self._autofill_provider_id() + self._autofill_within_iris() + self._autofill_pageview() + self._autofill_database_iri() + self._clean_action_labels() + super().clean() + + def _autofill_platform_iri(self): + if self.platform_iri is None: + self.platform_iri = website_settings.DOMAIN + + def _autofill_item_iri_and_osfid(self): + if self.item_osfid and not self.item_iri: + self.item_iri = osf_iri(self.item_osfid) + elif self.item_iri and not self.item_osfid: + try: + self.item_osfid = osfid_from_iri(self.item_iri) + except ValueError: + pass + + def _autofill_item_public(self): + if self.item_osfid and (self.item_public is None): + _item = self._osfid_referent + # if it quacks like BaseFileNode, look at .target instead + _item = getattr(_item, 'target', None) or _item + self.item_public = ( + _item.verified_publishable # quacks like Preprint + if hasattr(_item, 'verified_publishable') + else getattr(_item, 'is_public', False) # quacks like AbstractNode + ) + + def _autofill_item_type(self): + if self.item_osfid and not self.item_type: + self.item_type = osfmap_type(self._osfid_referent) + + def _autofill_provider_id(self): + if self.item_osfid and not self.provider_id: + _provider = getattr(self._osfid_referent, 'provider', None) + if _provider is None: + self.provider_id = 'osf' # quacks like Node, 
Comment, WikiPage
+            elif isinstance(_provider, str):
+                self.provider_id = _provider  # quacks like BaseFileNode
+            else:
+                self.provider_id = _provider._id  # quacks like Registration, Preprint, Collection
+
+    def _autofill_within_iris(self):
+        if self.item_osfid and (self.within_iris is None) and self._osfid_referent:
+            self.within_iris = [
+                osf_iri(_osfid)
+                for _osfid in _get_surrounding_guids(self._osfid_referent)
+            ]
+            # ensure inclusive "within" (also covers the empty-list case)
+            if self.item_iri not in self.within_iris:
+                self.within_iris = [self.item_iri, *self.within_iris]
+
+    def _autofill_pageview(self):
+        # autofill pageview_info fields from other fields
+        if self.pageview_info:
+            self.pageview_info.hour_of_day = self.timestamp.hour
+            _url = self.pageview_info.page_url
+            if _url:
+                self.pageview_info.page_path = urlsplit(_url).path.rstrip('/')
+            _ref_url = self.pageview_info.referer_url
+            if _ref_url:
+                self.pageview_info.referer_domain = urlsplit(_ref_url).netloc
+
+    def _autofill_database_iri(self):
+        if self.item_osfid and not self.database_iri:
+            _provider = getattr(self._osfid_referent, 'provider', None)
+            if not _provider:
+                self.database_iri = website_settings.DOMAIN
+            elif isinstance(_provider, str):
+                # file providers are a different thing that don't really have an iri, just an id
+                self.database_iri = f'urn:files.osf.io:{self.provider_id}'
+            else:
+                self.database_iri = _provider.get_semantic_iri()
+
+    def _clean_action_labels(self):
+        if self.action_labels:
+            self.action_labels = sorted(self.action_labels)
+
+
+class RegistriesModerationEventEs8(djelme.EventRecord):
+    UNIQUE_TOGETHER_FIELDS = (
+        'timestamp', 'registration_id', 'trigger', 'from_state', 'to_state', 'user_id'
+    )
+
+    registration_id: str
+    provider_id: str
+    trigger: str
+    from_state: str
+    to_state: str
+    user_id: str
+    comment: str | None
+
+    class Meta:
+        timeseries_recordtype_name = 'RegistriesModerationEvent'
+        timeseries_index_timedepth = MONTHLY
+
+
+###
+# Reusable inner objects for reports
+
+class RunningTotal(esdsl.InnerDoc):
+    total: int
+    total_daily: int | None
+
+
+class FileRunningTotals(esdsl.InnerDoc):
+    total: int
+    public: int
+    private: int
+    total_daily: int
+    public_daily: int
+    private_daily: int
+
+
+class NodeRunningTotals(esdsl.InnerDoc):
+    total: int
+    total_excluding_spam: int | None
+    public: int
+    private: int
+    total_daily: int
+    total_daily_excluding_spam: int | None
+    public_daily: int
+    private_daily: int
+
+
+class RegistrationRunningTotals(esdsl.InnerDoc):
+    total: int
+    public: int
+    embargoed: int
+    embargoed_v2: int
+    withdrawn: int | None
+    total_daily: int
+    public_daily: int
+    embargoed_daily: int
+    embargoed_v2_daily: int
+    withdrawn_daily: int | None
+
+
+class UsageByStorageAddon(esdsl.InnerDoc):
+    addon_shortname: str
+    enabled_usersettings: RunningTotal
+    linked_usersettings: RunningTotal
+    deleted_usersettings: RunningTotal
+    usersetting_links: RunningTotal
+    connected_nodesettings: RunningTotal
+    disconnected_nodesettings: RunningTotal
+    deleted_nodesettings: RunningTotal
+
+
+###
+# Cyclic reports
+
+
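+# (CYCLE_TIMEDEPTH mirrors the old DailyReport/MonthlyReport split -- one record
+# per cycle -- with cycle_coverage ('Y.M.D' or 'Y.M') standing in for the old
+# report_date/report_yearmonth fields; see _convert_unchanged_cyclicrecord_kwargs
+# in the migration command)
+class DailyStorageAddonUsageReportEs8(djelme.CyclicRecord):
+    CYCLE_TIMEDEPTH = DAILY
+
+    usage_by_addon: list[UsageByStorageAddon]
+
+    class Meta:
+        timeseries_index_timedepth = YEARLY
+        timeseries_recordtype_name = 'DailyStorageAddonUsageReport'
+
+
+class DailyDownloadCountReportEs8(djelme.CyclicRecord):
+    CYCLE_TIMEDEPTH = DAILY
+
+    daily_file_downloads: int
+
+    class Meta:
+        timeseries_index_timedepth = YEARLY
+        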
timeseries_recordtype_name = 'DailyDownloadCountReport' + + +class DailyInstitutionSummaryReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = DAILY + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) + + institution_id: str + institution_name: str + users: RunningTotal + nodes: NodeRunningTotals + projects: NodeRunningTotals + registered_nodes: RegistrationRunningTotals + registered_projects: RegistrationRunningTotals + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DailyInstitutionSummaryReport' + + +class DailyNewUserDomainReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = DAILY + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) + + domain_name: str + new_user_count: int + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DailyNewUserDomainReport' + + +class DailyNodeSummaryReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = DAILY + + nodes: NodeRunningTotals + projects: NodeRunningTotals + registered_nodes: RegistrationRunningTotals + registered_projects: RegistrationRunningTotals + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DailyNodeSummaryReport' + + +class DailyOsfstorageFileCountReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = DAILY + + files: FileRunningTotals + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DailyOsfstorageFileCountReport' + + +class DailyPreprintSummaryReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = DAILY + + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) + provider_key: str + preprint_count: int + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DailyPreprintSummaryReport' + + +class DailyUserSummaryReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = DAILY + + active: int + deactivated: int + merged: int + new_users_daily: int + new_users_with_institution_daily: int + unconfirmed: int + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DailyUserSummaryReport' + + +class MonthlySpamSummaryReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = MONTHLY + + node_confirmed_spam: int + node_confirmed_ham: int + node_flagged: int + registration_confirmed_spam: int + registration_confirmed_ham: int + registration_flagged: int + preprint_confirmed_spam: int + preprint_confirmed_ham: int + preprint_flagged: int + user_marked_as_spam: int + user_marked_as_ham: int + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'MonthlySpamSummaryReport' + + +class MonthlyInstitutionalUserReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = MONTHLY + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) + + institution_id: str + # user info: + user_id: str + user_name: str + department_name: str | None + month_last_login = YearmonthField() + month_last_active = YearmonthField() + account_creation_date = YearmonthField() + orcid_id: str | None + # counts: + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + public_file_count: int = esdsl.mapped_field(esdsl.Long()) + storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'MonthlyInstitutionalUserReport' + + +class MonthlyInstitutionSummaryReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = MONTHLY + UNIQUE_TOGETHER_FIELDS = 
('cycle_coverage', 'institution_id', ) + + institution_id: str + user_count: int + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + public_file_count: int = esdsl.mapped_field(esdsl.Long()) + monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long()) + monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'MonthlyInstitutionSummaryReport' + + +class MonthlyPublicItemUsageReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = MONTHLY + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid') + + # where noted, fields are meant to correspond to defined terms from COUNTER + # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html + # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html + item_osfid: str + item_type: list[str] # counter:Data-Type + provider_id: list[str] # counter:Database(?) + platform_iri: list[str] # counter:Platform + + # view counts include views on components or files contained by this item + view_count: int = esdsl.mapped_field(esdsl.Long()) + view_session_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_view_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_view_session_count: int = esdsl.mapped_field(esdsl.Long()) + + # download counts of this item only (not including contained components or files) + download_count: int = esdsl.mapped_field(esdsl.Long()) + download_session_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_download_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long()) + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'MonthlyPublicItemUsageReport' + + +class MonthlyPrivateSpamMetricsReportEs8(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = MONTHLY + + node_oopspam_flagged: int + node_oopspam_hammed: int + node_akismet_flagged: int + node_akismet_hammed: int + preprint_oopspam_flagged: int + preprint_oopspam_hammed: int + preprint_akismet_flagged: int + preprint_akismet_hammed: int + + class Meta: + timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'MonthlyPrivateSpamMetricsReport' + + +### +# data migration state + +class Elastic6To8State(djelme.SimpleRecord): + """index for storing values helpful for keeping track of the elastic 6->8 data migration""" + UNIQUE_TOGETHER_FIELDS = ('key',) + key: str + value: str | None + timestamp: datetime.datetime = esdsl.mapped_field( + default_factory=lambda: datetime.datetime.now(datetime.UTC), + ) + + @classmethod + def get_by_key(cls, key: str): + _response = cls.search().query({'term': {'key': key}})[0].execute() + return _response[0] if _response else None + + @classmethod + def get_timestamp(cls, key: str) -> datetime.datetime | None: + _record = cls.get_by_key(key) + return _record.timestamp if _record else None + + @classmethod + def get_started_at(cls): + return cls.get_timestamp('started_at') + + @classmethod + def set_started_at_now(cls): + _record = cls.record(key='started_at') + cls.refresh() + return _record.timestamp diff --git a/osf/metrics/metric_mixin.py b/osf/metrics/metric_mixin.py index 724ab1958da..df87d5123b1 100644 --- a/osf/metrics/metric_mixin.py +++ b/osf/metrics/metric_mixin.py @@ -2,7 +2,7 @@ from django.db import models from django.utils 
import timezone -from elasticsearch.exceptions import NotFoundError +from elasticsearch6.exceptions import NotFoundError import pytz diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py index 9d02ec191a2..d284d80827e 100644 --- a/osf/metrics/preprint_metrics.py +++ b/osf/metrics/preprint_metrics.py @@ -1,5 +1,5 @@ -from elasticsearch.exceptions import NotFoundError -from elasticsearch_metrics import metrics +from elasticsearch6.exceptions import NotFoundError +import elasticsearch_metrics.imps.elastic6 as metrics from .metric_mixin import MetricMixin diff --git a/osf/metrics/registry_metrics.py b/osf/metrics/registry_metrics.py index 475dca28673..9c779fe8c0b 100644 --- a/osf/metrics/registry_metrics.py +++ b/osf/metrics/registry_metrics.py @@ -1,4 +1,4 @@ -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates from .metric_mixin import MetricMixin diff --git a/osf/metrics/reporters/download_count.py b/osf/metrics/reporters/download_count.py index f772722dc31..4350c1440a1 100644 --- a/osf/metrics/reporters/download_count.py +++ b/osf/metrics/reporters/download_count.py @@ -1,14 +1,22 @@ from osf.models import PageCounter from osf.metrics.reports import DownloadCountReport +from osf.metrics.es8_metrics import DailyDownloadCountReportEs8 +from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter class DownloadCountReporter(DailyReporter): def report(self, date): download_count = int(PageCounter.get_all_downloads_on_date(date) or 0) - return [ - DownloadCountReport( - daily_file_downloads=download_count, - report_date=date, - ), - ] + reports = [] + report_es8 = DailyDownloadCountReportEs8( + cycle_coverage=cycle_coverage_date(date), + daily_file_downloads=download_count, + ) + reports.append(report_es8) + report = DownloadCountReport( + daily_file_downloads=report_es8.daily_file_downloads, + report_date=date, + ) + reports.append(report) + return reports diff --git a/osf/metrics/reporters/institution_summary.py b/osf/metrics/reporters/institution_summary.py index 892e337aec4..1148f2456e5 100644 --- a/osf/metrics/reporters/institution_summary.py +++ b/osf/metrics/reporters/institution_summary.py @@ -9,9 +9,15 @@ RegistrationRunningTotals, ) from osf.models import Institution +from osf.metrics.es8_metrics import ( + DailyInstitutionSummaryReportEs8, + RunningTotal as RunningTotalEs8, + NodeRunningTotals as NodeRunningTotalsEs8, + RegistrationRunningTotals as RegistrationRunningTotalsEs8 +) +from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter - logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -39,16 +45,15 @@ def report(self, date): created__date__lte=date, type='osf.registration', ) - - report = InstitutionSummaryReport( - report_date=date, + report_es8 = DailyInstitutionSummaryReportEs8( + cycle_coverage=cycle_coverage_date(date), institution_id=institution._id, institution_name=institution.name, - users=RunningTotal( + users=RunningTotalEs8( total=institution.get_institution_users().filter(is_active=True).count(), total_daily=institution.get_institution_users().filter(date_confirmed__date=date).count(), ), - nodes=NodeRunningTotals( + nodes=NodeRunningTotalsEs8( total=node_qs.count(), public=node_qs.filter(public_query).count(), private=node_qs.filter(private_query).count(), @@ -58,7 +63,7 @@ def report(self, date): 
private_daily=node_qs.filter(private_query & daily_query).count(), ), # Projects use get_roots to remove children - projects=NodeRunningTotals( + projects=NodeRunningTotalsEs8( total=node_qs.get_roots().count(), public=node_qs.filter(public_query).get_roots().count(), private=node_qs.filter(private_query).get_roots().count(), @@ -67,7 +72,7 @@ def report(self, date): public_daily=node_qs.filter(public_query & daily_query).get_roots().count(), private_daily=node_qs.filter(private_query & daily_query).get_roots().count(), ), - registered_nodes=RegistrationRunningTotals( + registered_nodes=RegistrationRunningTotalsEs8( total=registration_qs.count(), public=registration_qs.filter(public_query).count(), embargoed=registration_qs.filter(private_query).count(), @@ -78,7 +83,7 @@ def report(self, date): embargoed_daily=registration_qs.filter(private_query & daily_query).count(), embargoed_v2_daily=registration_qs.filter(private_query & daily_query & embargo_v2_query).count(), ), - registered_projects=RegistrationRunningTotals( + registered_projects=RegistrationRunningTotalsEs8( total=registration_qs.get_roots().count(), public=registration_qs.filter(public_query).get_roots().count(), embargoed=registration_qs.filter(private_query).get_roots().count(), @@ -87,7 +92,60 @@ def report(self, date): total_daily=registration_qs.filter(daily_query).get_roots().count(), public_daily=registration_qs.filter(public_query & daily_query).get_roots().count(), embargoed_daily=registration_qs.filter(private_query & daily_query).get_roots().count(), - embargoed_v2_daily=registration_qs.filter(private_query & daily_query & embargo_v2_query).get_roots().count(), + embargoed_v2_daily=registration_qs.filter( + private_query & daily_query & embargo_v2_query).get_roots().count(), + ), + ) + reports.append(report_es8) + + report = InstitutionSummaryReport( + report_date=date, + institution_id=institution._id, + institution_name=institution.name, + users=RunningTotal( + total=report_es8.users.total, + total_daily=report_es8.users.total_daily, + ), + nodes=NodeRunningTotals( + total=report_es8.nodes.total, + public=report_es8.nodes.public, + private=report_es8.nodes.private, + + total_daily=report_es8.nodes.total_daily, + public_daily=report_es8.nodes.public_daily, + private_daily=report_es8.nodes.private_daily, + ), + # Projects use get_roots to remove children + projects=NodeRunningTotals( + total=report_es8.projects.total, + public=report_es8.projects.public, + private=report_es8.projects.private, + + total_daily=report_es8.projects.total_daily, + public_daily=report_es8.projects.public_daily, + private_daily=report_es8.projects.private_daily, + ), + registered_nodes=RegistrationRunningTotals( + total=report_es8.registered_nodes.total, + public=report_es8.registered_nodes.public, + embargoed=report_es8.registered_nodes.embargoed, + embargoed_v2=report_es8.registered_nodes.embargoed_v2, + + total_daily=report_es8.registered_nodes.total_daily, + public_daily=report_es8.registered_nodes.public_daily, + embargoed_daily=report_es8.registered_nodes.embargoed_daily, + embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily, + ), + registered_projects=RegistrationRunningTotals( + total=report_es8.registered_projects.total, + public=report_es8.registered_projects.public, + embargoed=report_es8.registered_projects.embargoed, + embargoed_v2=report_es8.registered_projects.embargoed_v2, + + total_daily=report_es8.registered_projects.total_daily, + public_daily=report_es8.registered_projects.public_daily, + 
embargoed_daily=report_es8.registered_projects.embargoed_daily, + embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily, ), ) diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py index 4748860db32..88d8e1fb891 100644 --- a/osf/metrics/reporters/institution_summary_monthly.py +++ b/osf/metrics/reporters/institution_summary_monthly.py @@ -5,9 +5,10 @@ from osf.models.spam import SpamStatus from addons.osfstorage.models import OsfStorageFile from osf.metrics.reports import InstitutionMonthlySummaryReport +from osf.metrics.es8_metrics import MonthlyInstitutionSummaryReportEs8 +from osf.metrics.utils import cycle_coverage_yearmonth from ._base import MonthlyReporter - class InstitutionalSummaryMonthlyReporter(MonthlyReporter): """Generate an InstitutionMonthlySummaryReport for each institution.""" @@ -20,7 +21,8 @@ def iter_report_kwargs(self, continue_after: dict | None = None): def report(self, **report_kwargs): _institution = Institution.objects.get(pk=report_kwargs['institution_pk']) - return self.generate_report(_institution) + reports = self.generate_report(_institution) + return reports def generate_report(self, institution): node_queryset = institution.nodes.filter( @@ -31,8 +33,9 @@ def generate_report(self, institution): ) preprint_queryset = self.get_published_preprints(institution, self.yearmonth) - - return InstitutionMonthlySummaryReport( + reports = [] + report_es8 = MonthlyInstitutionSummaryReportEs8( + cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), institution_id=institution._id, user_count=institution.get_institution_users().count(), private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False), @@ -45,6 +48,23 @@ def generate_report(self, institution): monthly_logged_in_user_count=self.get_monthly_logged_in_user_count(institution, self.yearmonth), monthly_active_user_count=self.get_monthly_active_user_count(institution, self.yearmonth), ) + reports.append(report_es8) + + report = InstitutionMonthlySummaryReport( + institution_id=report_es8.institution_id, + user_count=report_es8.user_count, + private_project_count=report_es8.private_project_count, + public_project_count=report_es8.public_project_count, + public_registration_count=report_es8.public_registration_count, + embargoed_registration_count=report_es8.embargoed_registration_count, + published_preprint_count=report_es8.published_preprint_count, + storage_byte_count=report_es8.storage_byte_count, + public_file_count=report_es8.public_file_count, + monthly_logged_in_user_count=report_es8.monthly_logged_in_user_count, + monthly_active_user_count=report_es8.monthly_active_user_count, + ) + reports.append(report) + return reports def _get_count(self, node_queryset, node_type, is_public): return node_queryset.filter(type=node_type, is_public=is_public, root_id=F('pk')).count() diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py index 512472a3d96..31c31f8ff22 100644 --- a/osf/metrics/reporters/institutional_users.py +++ b/osf/metrics/reporters/institutional_users.py @@ -1,4 +1,5 @@ import dataclasses +from typing import List from django.contrib.contenttypes.models import ContentType from django.db.models import Q, F, Sum @@ -7,7 +8,8 @@ from osf.models.spam import SpamStatus from addons.osfstorage.models import OsfStorageFile from osf.metrics.reports import InstitutionalUserReport -from osf.metrics.utils import YearMonth +from osf.metrics.utils import YearMonth, 
cycle_coverage_yearmonth
+from osf.metrics.es8_metrics import MonthlyInstitutionalUserReportEs8
 from ._base import MonthlyReporter
 
 
@@ -38,7 +40,7 @@ def report(self, **report_kwargs):
         _institution = osfdb.Institution.objects.get(pk=report_kwargs['institution_pk'])
         _user = osfdb.OSFUser.objects.get(pk=report_kwargs['user_pk'])
         _helper = _InstiUserReportHelper(_institution, _user, self.yearmonth)
-        return _helper.report
+        return _helper.reports
 
 
 # helper
@@ -47,11 +50,13 @@ class _InstiUserReportHelper:
     institution: osfdb.Institution
     user: osfdb.OSFUser
     yearmonth: YearMonth
-    report: InstitutionalUserReport = dataclasses.field(init=False)
+    reports: List[InstitutionalUserReport | MonthlyInstitutionalUserReportEs8] = dataclasses.field(init=False)
 
     def __post_init__(self):
         _affiliation = self.user.get_institution_affiliation(self.institution._id)
-        self.report = InstitutionalUserReport(
+        self.reports = []
+        report_es8 = MonthlyInstitutionalUserReportEs8(
+            cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
             institution_id=self.institution._id,
             user_id=self.user._id,
             user_name=self.user.fullname,
@@ -72,6 +77,27 @@ def __post_init__(self):
             published_preprint_count=self._published_preprint_queryset().count(),
             storage_byte_count=self._storage_byte_count(),
         )
+        self.reports.append(report_es8)
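+        # build the legacy es6 report from the es8 report's own values, so both
+        # backends record identical numbers for the month
+        report = InstitutionalUserReport(
+            institution_id=report_es8.institution_id,
+            user_id=report_es8.user_id,
+            user_name=report_es8.user_name,
+            department_name=report_es8.department_name,
+            month_last_login=report_es8.month_last_login,
+            month_last_active=report_es8.month_last_active,
+            account_creation_date=report_es8.account_creation_date,
+            orcid_id=report_es8.orcid_id,
+            public_project_count=report_es8.public_project_count,
+            private_project_count=report_es8.private_project_count,
+            public_registration_count=report_es8.public_registration_count,
+            embargoed_registration_count=report_es8.embargoed_registration_count,
+            public_file_count=report_es8.public_file_count,
+            published_preprint_count=report_es8.published_preprint_count,
+            storage_byte_count=report_es8.storage_byte_count,
+        )
+        self.reports.append(report)
 
     @property
     def before_datetime(self):
diff --git a/osf/metrics/reporters/new_user_domain.py b/osf/metrics/reporters/new_user_domain.py
index ec13aad860f..125e02754d7 100644
--- a/osf/metrics/reporters/new_user_domain.py
+++ b/osf/metrics/reporters/new_user_domain.py
@@ -3,6 +3,8 @@
 
 from osf.models import OSFUser
 from osf.metrics.reports import NewUserDomainReport
+from osf.metrics.es8_metrics import DailyNewUserDomainReportEs8
+from osf.metrics.utils import cycle_coverage_date
 from ._base import DailyReporter
 
 logger = logging.getLogger(__name__)
@@ -20,11 +22,19 @@ def report(self, date):
         email.split('@')[-1] for email in new_user_emails
     )
 
-        return [
-            NewUserDomainReport(
-                report_date=date,
+        reports = []
+        for domain_name, count in domain_names.items():
+            report_es8 = DailyNewUserDomainReportEs8(
+                cycle_coverage=cycle_coverage_date(date),
                 domain_name=domain_name,
                 new_user_count=count,
             )
-            for domain_name, count in domain_names.items()
-        ]
+            reports.append(report_es8)
+
+            report = NewUserDomainReport(
+                report_date=date,
+                domain_name=report_es8.domain_name,
+                new_user_count=report_es8.new_user_count,
+            )
+            reports.append(report)
+        return reports
diff --git a/osf/metrics/reporters/node_count.py b/osf/metrics/reporters/node_count.py
index 0a4120ca1f9..23f4c9bb78c 100644
--- a/osf/metrics/reporters/node_count.py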
+++ b/osf/metrics/reporters/node_count.py @@ -7,9 +7,14 @@ NodeRunningTotals, RegistrationRunningTotals, ) +from osf.metrics.es8_metrics import ( + DailyNodeSummaryReportEs8, + NodeRunningTotals as NodeRunningTotalsEs8, + RegistrationRunningTotals as RegistrationRunningTotalsEs8 +) +from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter - logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -35,11 +40,11 @@ def report(self, date): embargo_v2_query = Q(root__embargo__end_date__date__gt=date) exclude_spam = ~Q(spam_status__in=[SpamStatus.SPAM, SpamStatus.FLAGGED]) - - report = NodeSummaryReport( - report_date=date, + reports = [] + report_es8 = DailyNodeSummaryReportEs8( + cycle_coverage=cycle_coverage_date(date), # Nodes - the number of projects and components - nodes=NodeRunningTotals( + nodes=NodeRunningTotalsEs8( total=node_qs.count(), total_excluding_spam=node_qs.filter(exclude_spam).count(), public=node_qs.filter(public_query).count(), @@ -50,7 +55,7 @@ def report(self, date): private_daily=node_qs.filter(private_query & created_today_query).count(), ), # Projects - the number of top-level only projects - projects=NodeRunningTotals( + projects=NodeRunningTotalsEs8( total=node_qs.get_roots().count(), total_excluding_spam=node_qs.get_roots().filter(exclude_spam).count(), public=node_qs.filter(public_query).get_roots().count(), @@ -61,7 +66,7 @@ def report(self, date): private_daily=node_qs.filter(private_query & created_today_query).get_roots().count(), ), # Registered Nodes - the number of registered projects and components - registered_nodes=RegistrationRunningTotals( + registered_nodes=RegistrationRunningTotalsEs8( total=registration_qs.count(), public=registration_qs.filter(public_query).count(), embargoed=registration_qs.filter(private_query).count(), @@ -75,7 +80,7 @@ def report(self, date): ), # Registered Projects - the number of registered top level projects - registered_projects=RegistrationRunningTotals( + registered_projects=RegistrationRunningTotalsEs8( total=registration_qs.get_roots().count(), public=registration_qs.filter(public_query).get_roots().count(), embargoed=registration_qs.filter(private_query).get_roots().count(), @@ -88,5 +93,58 @@ def report(self, date): withdrawn_daily=registration_qs.filter(retracted_query & retracted_today_query).get_roots().count(), ), ) + reports.append(report_es8) + report = NodeSummaryReport( + report_date=date, + # Nodes - the number of projects and components + nodes=NodeRunningTotals( + total=report_es8.nodes.total, + total_excluding_spam=report_es8.nodes.total_excluding_spam, + public=report_es8.nodes.public, + private=report_es8.nodes.private, + total_daily=report_es8.nodes.total_daily, + total_daily_excluding_spam=report_es8.nodes.total_daily_excluding_spam, + public_daily=report_es8.nodes.public_daily, + private_daily=report_es8.nodes.private_daily, + ), + # Projects - the number of top-level only projects + projects=NodeRunningTotals( + total=report_es8.projects.total, + total_excluding_spam=report_es8.projects.total_excluding_spam, + public=report_es8.projects.public, + private=report_es8.projects.private, + total_daily=report_es8.projects.total_daily, + total_daily_excluding_spam=report_es8.projects.total_daily_excluding_spam, + public_daily=report_es8.projects.public_daily, + private_daily=report_es8.projects.private_daily, + ), + # Registered Nodes - the number of registered projects and components + registered_nodes=RegistrationRunningTotals( + 
total=report_es8.registered_nodes.total, + public=report_es8.registered_nodes.public, + embargoed=report_es8.registered_nodes.embargoed, + embargoed_v2=report_es8.registered_nodes.embargoed_v2, + withdrawn=report_es8.registered_nodes.withdrawn, + total_daily=report_es8.registered_nodes.total_daily, + public_daily=report_es8.registered_nodes.public_daily, + embargoed_daily=report_es8.registered_nodes.embargoed_daily, + embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily, + withdrawn_daily=report_es8.registered_nodes.withdrawn_daily, + ), + # Registered Projects - the number of registered top level projects + registered_projects=RegistrationRunningTotals( + total=report_es8.registered_projects.total, + public=report_es8.registered_projects.public, + embargoed=report_es8.registered_projects.embargoed, + embargoed_v2=report_es8.registered_projects.embargoed_v2, + withdrawn=report_es8.registered_projects.withdrawn, + total_daily=report_es8.registered_projects.total_daily, + public_daily=report_es8.registered_projects.public_daily, + embargoed_daily=report_es8.registered_projects.embargoed_daily, + embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily, + withdrawn_daily=report_es8.registered_projects.withdrawn_daily, + ), + ) + reports.append(report) - return [report] + return reports diff --git a/osf/metrics/reporters/osfstorage_file_count.py b/osf/metrics/reporters/osfstorage_file_count.py index 2f35e1e81fd..6ddeb89945b 100644 --- a/osf/metrics/reporters/osfstorage_file_count.py +++ b/osf/metrics/reporters/osfstorage_file_count.py @@ -4,9 +4,13 @@ from osf.metrics.reports import OsfstorageFileCountReport, FileRunningTotals from osf.models import AbstractNode, Preprint +from osf.metrics.es8_metrics import ( + DailyOsfstorageFileCountReportEs8, + FileRunningTotals as FileRunningTotalsEs8 +) +from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter - logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -32,9 +36,11 @@ def report(self, date): daily_query = Q(created__date=date) - report = OsfstorageFileCountReport( - report_date=date, - files=FileRunningTotals( + reports = [] + + report_es8 = DailyOsfstorageFileCountReportEs8( + cycle_coverage=cycle_coverage_date(date), + files=FileRunningTotalsEs8( total=file_qs.count(), public=file_qs.filter(public_query).count(), private=file_qs.filter(private_query).count(), @@ -43,5 +49,19 @@ def report(self, date): private_daily=file_qs.filter(private_query & daily_query).count(), ), ) + reports.append(report_es8) + + report = OsfstorageFileCountReport( + report_date=date, + files=FileRunningTotals( + total=report_es8.files.total, + public=report_es8.files.public, + private=report_es8.files.private, + total_daily=report_es8.files.total_daily, + public_daily=report_es8.files.public_daily, + private_daily=report_es8.files.private_daily, + ), + ) + reports.append(report) - return [report] + return reports diff --git a/osf/metrics/reporters/preprint_count.py b/osf/metrics/reporters/preprint_count.py index 23f68bc7736..85ba639a32f 100644 --- a/osf/metrics/reporters/preprint_count.py +++ b/osf/metrics/reporters/preprint_count.py @@ -3,6 +3,8 @@ from osf.metrics import PreprintSummaryReport from website import settings +from osf.metrics.es8_metrics import DailyPreprintSummaryReportEs8 +from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter logger = logging.getLogger(__name__) @@ -48,13 +50,20 @@ def report(self, date): for preprint_provider in 
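Editor's note: with osfstorage_file_count now returning an [es8, legacy] pair, the return shape of each migrated daily reporter is easy to pin down in a test. A hypothetical pytest-style helper, not part of this PR's suite, assuming matching field names across the pair:

```python
# Hypothetical test helper (not in this PR): each migrated daily reporter
# now returns [es8 report, legacy report] pairs whose counts must match,
# which can be asserted generically.
def assert_dual_reports(reports, es8_cls, legacy_cls, fields):
    es8 = [r for r in reports if isinstance(r, es8_cls)]
    legacy = [r for r in reports if isinstance(r, legacy_cls)]
    assert len(es8) == len(legacy) == len(reports) / 2
    for r8, rl in zip(es8, legacy):
        for field in fields:
            assert getattr(r8, field) == getattr(rl, field)
```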
PreprintProvider.objects.all(): elastic_query = get_elastic_query(date, preprint_provider) resp = requests.post(f'{settings.SHARE_URL}api/v2/search/creativeworks/_search', json=elastic_query).json() - reports.append( - PreprintSummaryReport( - report_date=date, - provider_key=preprint_provider._id, - preprint_count=resp['hits']['total'], - ) + + report_es8 = DailyPreprintSummaryReportEs8( + cycle_coverage=cycle_coverage_date(date), + provider_key=preprint_provider._id, + preprint_count=resp['hits']['total'], + ) + reports.append(report_es8) + + report = PreprintSummaryReport( + report_date=date, + provider_key=report_es8.provider_key, + preprint_count=report_es8.preprint_count, ) + reports.append(report) logger.info('{} Preprints counted for the provider {}'.format(resp['hits']['total'], preprint_provider.name)) return reports diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py index 40f259af325..fde545247e6 100644 --- a/osf/metrics/reporters/private_spam_metrics.py +++ b/osf/metrics/reporters/private_spam_metrics.py @@ -1,8 +1,11 @@ from osf.metrics.reports import PrivateSpamMetricsReport from osf.external.oopspam.client import OOPSpamClient from osf.external.askismet.client import AkismetClient +from osf.metrics.es8_metrics import MonthlyPrivateSpamMetricsReportEs8 +from osf.metrics.utils import cycle_coverage_yearmonth from ._base import MonthlyReporter + class PrivateSpamMetricsReporter(MonthlyReporter): report_name = 'Private Spam Metrics' @@ -13,8 +16,10 @@ def report(self): oopspam_client = OOPSpamClient() akismet_client = AkismetClient() - report = PrivateSpamMetricsReport( - report_yearmonth=str(self.yearmonth), + reports = [] + + report_es8 = MonthlyPrivateSpamMetricsReportEs8( + cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'), node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'), node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'), @@ -24,5 +29,19 @@ def report(self): preprint_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='preprint'), preprint_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='preprint') ) + reports.append(report_es8) + + report = PrivateSpamMetricsReport( + report_yearmonth=str(self.yearmonth), + node_oopspam_flagged=report_es8.node_oopspam_flagged, + node_oopspam_hammed=report_es8.node_oopspam_hammed, + node_akismet_flagged=report_es8.node_akismet_flagged, + node_akismet_hammed=report_es8.node_akismet_hammed, + preprint_oopspam_flagged=report_es8.preprint_oopspam_flagged, + preprint_oopspam_hammed=report_es8.preprint_oopspam_hammed, + preprint_akismet_flagged=report_es8.preprint_akismet_flagged, + preprint_akismet_hammed=report_es8.preprint_akismet_hammed, + ) + reports.append(report) - return report + return reports diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py index cc401d50bd7..085bac38684 100644 --- a/osf/metrics/reporters/public_item_usage.py +++ b/osf/metrics/reporters/public_item_usage.py @@ -3,8 +3,11 @@ import typing import waffle + +from osf.metrics.es8_metrics import MonthlyPublicItemUsageReportEs8 + if typing.TYPE_CHECKING: - import elasticsearch_dsl as edsl + import elasticsearch6_dsl as edsl import osf.features from osf.metadata.osf_gathering import OsfmapPartition @@ -18,7 +21,7 
@@ PreprintView, ) from osf.metrics.reports import PublicItemUsageReport -from osf.metrics.utils import YearMonth +from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth from osf import models as osfdb from website import settings as website_settings from ._base import MonthlyReporter @@ -61,16 +64,17 @@ def report(self, **report_kwargs): if _guid is None or _guid.referent is None: raise _SkipItem _obj = _guid.referent - _report = self._init_report(_obj) - self._fill_report_counts(_report, _obj) - if not any(( - _report.view_count, - _report.view_session_count, - _report.download_count, - _report.download_session_count, - )): - raise _SkipItem - return _report + _reports = self._init_report(_obj) + for _report in _reports: + self._fill_report_counts(_report, _obj) + if not any(( + _report.view_count, + _report.view_session_count, + _report.download_count, + _report.download_session_count, + )): + raise _SkipItem + return _reports except _SkipItem: return None @@ -131,16 +135,27 @@ def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[s ) return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) - def _init_report(self, osf_obj) -> PublicItemUsageReport: + def _init_report(self, osf_obj) -> typing.List[PublicItemUsageReport | MonthlyPublicItemUsageReportEs8]: if not _is_item_public(osf_obj): raise _SkipItem - return PublicItemUsageReport( + reports = [] + report_es8 = MonthlyPublicItemUsageReportEs8( + cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), item_osfid=osf_obj._id, item_type=[get_item_type(osf_obj)], provider_id=[get_provider_id(osf_obj)], platform_iri=[website_settings.DOMAIN], + ) + reports.append(report_es8) + report = PublicItemUsageReport( + item_osfid=report_es8.item_osfid, + item_type=report_es8.item_type, + provider_id=report_es8.provider_id, + platform_iri=report_es8.platform_iri, # leave counts null; will be set if there's data ) + reports.append(report) + return reports def _fill_report_counts(self, report, osf_obj): if ( @@ -154,31 +169,43 @@ def _fill_report_counts(self, report, osf_obj): ( report.view_count, report.view_session_count, - ) = self._countedusage_view_counts(osf_obj) + ) = self._countedusage_view_counts(osf_obj, cumulative=False) ( report.download_count, report.download_session_count, - ) = self._countedusage_download_counts(osf_obj) + ) = self._countedusage_download_counts(osf_obj, cumulative=False) + + ( + report.cumulative_view_count, + report.cumulative_view_session_count, + ) = self._countedusage_view_counts(osf_obj, cumulative=True) - def _base_usage_search(self): + ( + report.cumulative_download_count, + report.cumulative_download_session_count, + ) = self._countedusage_download_counts(osf_obj, cumulative=True) + + def _base_usage_search(self, cumulative: bool = False): + timestamp_filter = { + 'lt': self.yearmonth.month_end(), + } + if not cumulative: + timestamp_filter['gte'] = self.yearmonth.month_start() return ( CountedAuthUsage.search() .filter('term', item_public=True) - .filter('range', timestamp={ - 'gte': self.yearmonth.month_start(), - 'lt': self.yearmonth.month_end(), - }) + .filter('range', timestamp=timestamp_filter) .extra(size=0) # only aggregations, no hits ) - def _countedusage_view_counts(self, osf_obj) -> tuple[int, int]: + def _countedusage_view_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]: '''compute view_session_count separately to avoid double-counting (the same session may be represented in both the composite agg on `item_guid` and 
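Editor's note: the `_base_usage_search(cumulative=...)` change is the crux of the new cumulative counts. The upper bound on `timestamp` always stays at the month's end, and only the non-cumulative (monthly) variant adds the lower bound. A standalone sketch of that filter construction, with plain datetimes standing in for the values `YearMonth.month_start()`/`month_end()` would return:

```python
# Sketch of the timestamp-filter logic added to _base_usage_search():
# cumulative counts drop the lower bound so they cover all usage up to
# the end of the report month; monthly counts keep [month_start, month_end).
import datetime


def timestamp_filter(month_start: datetime.datetime,
                     month_end: datetime.datetime,
                     cumulative: bool = False) -> dict:
    _filter = {'lt': month_end}  # always bounded above by the month's end
    if not cumulative:
        _filter['gte'] = month_start  # monthly counts are bounded below too
    return _filter


assert timestamp_filter(
    datetime.datetime(2024, 1, 1), datetime.datetime(2024, 2, 1),
) == {'gte': datetime.datetime(2024, 1, 1), 'lt': datetime.datetime(2024, 2, 1)}
assert timestamp_filter(
    datetime.datetime(2024, 1, 1), datetime.datetime(2024, 2, 1), cumulative=True,
) == {'lt': datetime.datetime(2024, 2, 1)}
```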
that on `surrounding_guids`) ''' _search = ( - self._base_usage_search() + self._base_usage_search(cumulative=cumulative) .query( 'bool', filter=[ @@ -206,10 +233,10 @@ def _countedusage_view_counts(self, osf_obj) -> tuple[int, int]: ) return (_view_count, _view_session_count) - def _countedusage_download_counts(self, osf_obj) -> tuple[int, int]: + def _countedusage_download_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]: '''aggregate downloads on each osfid (not including components/files)''' _search = ( - self._base_usage_search() + self._base_usage_search(cumulative=cumulative) .filter('term', item_guid=osf_obj._id) .filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value) ) diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py index 319381fe899..2fbac671ad1 100644 --- a/osf/metrics/reporters/spam_count.py +++ b/osf/metrics/reporters/spam_count.py @@ -1,9 +1,11 @@ from osf.models import OSFUser from osf.metrics.reports import SpamSummaryReport -from ._base import MonthlyReporter from osf.models import PreprintLog, NodeLog from osf.models.spam import SpamStatus +from osf.metrics.es8_metrics import MonthlySpamSummaryReportEs8 +from osf.metrics.utils import cycle_coverage_yearmonth +from ._base import MonthlyReporter class SpamCountReporter(MonthlyReporter): @@ -11,9 +13,9 @@ def report(self, **report_kwargs): assert not report_kwargs target_month = self.yearmonth.month_start() next_month = self.yearmonth.month_end() - - return SpamSummaryReport( - # Node Log entries + reports = [] + report_es8 = MonthlySpamSummaryReportEs8( + cycle_coverage=cycle_coverage_yearmonth(self.yearmonth), node_confirmed_spam=NodeLog.objects.filter( action=NodeLog.CONFIRM_SPAM, created__gt=target_month, @@ -79,3 +81,23 @@ def report(self, **report_kwargs): created__lt=next_month, ).count() ) + reports.append(report_es8) + report = SpamSummaryReport( + # Node Log entries + node_confirmed_spam=report_es8.node_confirmed_spam, + node_confirmed_ham=report_es8.node_confirmed_ham, + node_flagged=report_es8.node_flagged, + # Registration Log entries + registration_confirmed_spam=report_es8.registration_confirmed_spam, + registration_confirmed_ham=report_es8.registration_confirmed_ham, + registration_flagged=report_es8.registration_flagged, + # Preprint Log entries + preprint_confirmed_spam=report_es8.preprint_confirmed_spam, + preprint_confirmed_ham=report_es8.preprint_confirmed_ham, + preprint_flagged=report_es8.preprint_flagged, + # New Users marked as Spam/Ham + user_marked_as_spam=report_es8.user_marked_as_spam, + user_marked_as_ham=report_es8.user_marked_as_ham, + ) + reports.append(report) + return reports diff --git a/osf/metrics/reporters/storage_addon_usage.py b/osf/metrics/reporters/storage_addon_usage.py index 704254795f0..893373cebd1 100644 --- a/osf/metrics/reporters/storage_addon_usage.py +++ b/osf/metrics/reporters/storage_addon_usage.py @@ -13,6 +13,12 @@ from osf.metrics.reports import StorageAddonUsage, RunningTotal, UsageByStorageAddon from osf.models import SpamStatus, Tag from website import settings +from osf.metrics.es8_metrics import ( + DailyStorageAddonUsageReportEs8, + UsageByStorageAddon as UsageByStorageAddonEs8, + RunningTotal as RunningTotalEs8 +) +from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter logger = logging.getLogger(__name__) @@ -125,45 +131,89 @@ def report(self, date): if 'storage' in addon_config.categories } + usage_by_addon_es8 = [] usage_by_addon = [] for short_name, 
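Editor's note: the monthly reporters above bound their queries with `self.yearmonth.month_start()` and `month_end()`. A sketch of that inclusive/exclusive window, assuming those methods return the first instants of the report month and of the following month:

```python
# Sketch of the month window used by the monthly reporters, under the
# assumption that month_start()/month_end() yield an inclusive lower
# bound and an exclusive upper bound (the first instant of next month).
import datetime


def month_window(year: int, month: int) -> tuple[datetime.datetime, datetime.datetime]:
    start = datetime.datetime(year, month, 1, tzinfo=datetime.UTC)
    next_year, next_month = (year + 1, 1) if month == 12 else (year, month + 1)
    end = datetime.datetime(next_year, next_month, 1, tzinfo=datetime.UTC)
    return start, end  # filter with created__gt/gte=start, created__lt=end


assert month_window(2024, 12) == (
    datetime.datetime(2024, 12, 1, tzinfo=datetime.UTC),
    datetime.datetime(2025, 1, 1, tzinfo=datetime.UTC),
)
```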
addon_config in storage_addon_configs.items(): - user_counts = storage_addon_user_counts(date, addon_config.get_model('UserSettings')) - node_counts = storage_addon_node_counts(date, addon_config.get_model('NodeSettings')) - - usage_by_addon.append( - UsageByStorageAddon( - addon_shortname=short_name, - enabled_usersettings=RunningTotal( - total=user_counts.get('enabled_total', 0), - total_daily=user_counts.get('enabled_daily', 0), - ), - deleted_usersettings=RunningTotal( - total=user_counts.get('deleted_total', 0), - total_daily=user_counts.get('deleted_daily', 0), - ), - linked_usersettings=RunningTotal( - total=user_counts.get('linked_total', 0), - total_daily=user_counts.get('linked_daily', 0), - ), - usersetting_links=RunningTotal( - total=user_counts.get('link_count_total', 0), - total_daily=user_counts.get('link_count_daily', 0), - ), - connected_nodesettings=RunningTotal( - total=node_counts.get('connected_total', 0), - total_daily=node_counts.get('connected_daily', 0), - ), - disconnected_nodesettings=RunningTotal( - total=node_counts.get('disconnected_total', 0), - total_daily=node_counts.get('disconnected_daily', 0), - ), - deleted_nodesettings=RunningTotal( - total=node_counts.get('deleted_total', 0), - total_daily=node_counts.get('deleted_daily', 0), - ), + try: + _UserSettings = addon_config.get_model('UserSettings') + _NodeSettings = addon_config.get_model('NodeSettings') + except LookupError: + continue + user_counts = storage_addon_user_counts(date, _UserSettings) + node_counts = storage_addon_node_counts(date, _NodeSettings) + usage_by_storage_addon_es_8 = UsageByStorageAddonEs8( + addon_shortname=short_name, + enabled_usersettings=RunningTotalEs8( + total=user_counts.get('enabled_total', 0), + total_daily=user_counts.get('enabled_daily', 0), + ), + deleted_usersettings=RunningTotalEs8( + total=user_counts.get('deleted_total', 0), + total_daily=user_counts.get('deleted_daily', 0), + ), + linked_usersettings=RunningTotalEs8( + total=user_counts.get('linked_total', 0), + total_daily=user_counts.get('linked_daily', 0), + ), + usersetting_links=RunningTotalEs8( + total=user_counts.get('link_count_total', 0), + total_daily=user_counts.get('link_count_daily', 0), + ), + connected_nodesettings=RunningTotalEs8( + total=node_counts.get('connected_total', 0), + total_daily=node_counts.get('connected_daily', 0), + ), + disconnected_nodesettings=RunningTotalEs8( + total=node_counts.get('disconnected_total', 0), + total_daily=node_counts.get('disconnected_daily', 0), + ), + deleted_nodesettings=RunningTotalEs8( + total=node_counts.get('deleted_total', 0), + total_daily=node_counts.get('deleted_daily', 0), + ), + ) + usage_by_addon_es8.append(usage_by_storage_addon_es_8) + usage_by_storage_addon = UsageByStorageAddon( + addon_shortname=usage_by_storage_addon_es_8.addon_shortname, + enabled_usersettings=RunningTotal( + total=usage_by_storage_addon_es_8.enabled_usersettings.total, + total_daily=usage_by_storage_addon_es_8.enabled_usersettings.total_daily, + ), + deleted_usersettings=RunningTotal( + total=usage_by_storage_addon_es_8.deleted_usersettings.total, + total_daily=usage_by_storage_addon_es_8.deleted_usersettings.total_daily, + ), + linked_usersettings=RunningTotal( + total=usage_by_storage_addon_es_8.linked_usersettings.total, + total_daily=usage_by_storage_addon_es_8.linked_usersettings.total_daily, + ), + usersetting_links=RunningTotal( + total=usage_by_storage_addon_es_8.usersetting_links.total, + total_daily=usage_by_storage_addon_es_8.usersetting_links.total_daily, + ), + 
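Editor's note: the new try/except around `addon_config.get_model(...)` is worth calling out. Storage addon configs that don't define both settings models raise LookupError, and the reporter now skips them rather than crashing. The same guard, isolated as a sketch:

```python
# Sketch of the guard added in storage_addon_usage.py: skip addon configs
# whose UserSettings/NodeSettings models aren't registered, since
# get_model() raises LookupError for those.
def iter_addon_settings(storage_addon_configs: dict):
    for short_name, addon_config in storage_addon_configs.items():
        try:
            user_settings = addon_config.get_model('UserSettings')
            node_settings = addon_config.get_model('NodeSettings')
        except LookupError:
            continue  # addon has no usersettings/nodesettings to count
        yield short_name, user_settings, node_settings
```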
connected_nodesettings=RunningTotal( + total=usage_by_storage_addon_es_8.connected_nodesettings.total, + total_daily=usage_by_storage_addon_es_8.connected_nodesettings.total_daily, + ), + disconnected_nodesettings=RunningTotal( + total=usage_by_storage_addon_es_8.disconnected_nodesettings.total, + total_daily=usage_by_storage_addon_es_8.disconnected_nodesettings.total_daily, + ), + deleted_nodesettings=RunningTotal( + total=usage_by_storage_addon_es_8.deleted_nodesettings.total, + total_daily=usage_by_storage_addon_es_8.deleted_nodesettings.total_daily, ) ) - return [StorageAddonUsage( + usage_by_addon.append(usage_by_storage_addon) + reports = [] + report_es8 = DailyStorageAddonUsageReportEs8( + cycle_coverage=cycle_coverage_date(date), + usage_by_addon=usage_by_addon_es8, + ) + reports.append(report_es8) + report = StorageAddonUsage( report_date=date, usage_by_addon=usage_by_addon, - )] + ) + reports.append(report) + return reports diff --git a/osf/metrics/reporters/user_count.py b/osf/metrics/reporters/user_count.py index e0a61c7bb10..121b830c466 100644 --- a/osf/metrics/reporters/user_count.py +++ b/osf/metrics/reporters/user_count.py @@ -1,14 +1,17 @@ from osf.models import OSFUser from osf.metrics import UserSummaryReport +from osf.metrics.es8_metrics import DailyUserSummaryReportEs8 +from osf.metrics.utils import cycle_coverage_date from ._base import DailyReporter class UserCountReporter(DailyReporter): def report(self, report_date): - report = UserSummaryReport( - report_date=report_date, + reports = [] + report_es8 = DailyUserSummaryReportEs8( + cycle_coverage=cycle_coverage_date(report_date), active=OSFUser.objects.filter(is_active=True, date_confirmed__date__lte=report_date).count(), deactivated=OSFUser.objects.filter(date_disabled__isnull=False, date_disabled__date__lte=report_date).count(), merged=OSFUser.objects.filter(date_registered__date__lte=report_date, merged_by__isnull=False).count(), @@ -16,5 +19,16 @@ def report(self, report_date): new_users_with_institution_daily=OSFUser.objects.filter(is_active=True, date_confirmed__date=report_date, institutionaffiliation__isnull=False).count(), unconfirmed=OSFUser.objects.filter(date_registered__date__lte=report_date, date_confirmed__isnull=True).count(), ) + reports.append(report_es8) + report = UserSummaryReport( + report_date=report_date, + active=report_es8.active, + deactivated=report_es8.deactivated, + merged=report_es8.merged, + new_users_daily=report_es8.new_users_daily, + new_users_with_institution_daily=report_es8.new_users_with_institution_daily, + unconfirmed=report_es8.unconfirmed, + ) + reports.append(report) - return [report] + return reports diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index ffbcfb4c9b8..62479e359cd 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -4,7 +4,7 @@ from django.dispatch import receiver from elasticsearch6_dsl import InnerDoc -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from elasticsearch_metrics.signals import pre_save as metrics_pre_save from osf.metrics.utils import stable_key, YearMonth @@ -120,6 +120,8 @@ def save(self, *args, **kwargs): @receiver(metrics_pre_save) def set_report_id(sender, instance, **kwargs): + if not issubclass(sender, metrics.Metric): + return # skip es8 record types try: _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS except AttributeError: diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py index 973b8bf1ef3..c5d49f293cf 100644 --- a/osf/metrics/utils.py +++ 
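Editor's note: the reports.py guard above deserves a word. `set_report_id` is wired to a pre_save signal shared by both backends, so it must return early for senders that are not es6 Metric subclasses, since es8 record types manage their own ids. A self-contained sketch of the pattern with stand-in classes:

```python
# Sketch of the shared-signal guard in osf/metrics/reports.py, with
# stand-in classes for the django-elasticsearch-metrics types.
class Es6Metric:  # stand-in for the elastic6 Metric base class
    pass


class Es8Record:  # stand-in for an es8 record type
    pass


def set_report_id(sender, instance, **kwargs):
    if not issubclass(sender, Es6Metric):
        return  # skip es8 record types
    # ... es6 path: derive a stable _id from UNIQUE_TOGETHER_FIELDS ...


set_report_id(Es8Record, instance=None)  # no-op for an es8 sender
```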
b/osf/metrics/utils.py @@ -6,6 +6,28 @@ from hashlib import sha256 from typing import ClassVar +from elasticsearch_metrics.util.timeparts import format_timeparts + + +def cycle_coverage_date(given_date: datetime.date) -> str: + """ + >>> cycle_coverage_date(datetime.date(1234, 5, 6)) + '1234.5.6' + >>> cycle_coverage_date(datetime.datetime(7654, 3, 2, 1)) + '7654.3.2' + """ + return format_timeparts(given_date, 3) + + +def cycle_coverage_yearmonth(given_ym: YearMonth | datetime.date) -> str: + """ + >>> cycle_coverage_yearmonth(YearMonth(2222, 33)) + '2222.33' + >>> cycle_coverage_yearmonth(datetime.date(1234, 5, 6)) + '1234.5' + """ + return format_timeparts((given_ym.year, given_ym.month), 2) + def stable_key(*key_parts): """hash function for use in osf.metrics diff --git a/osf/models/registrations.py b/osf/models/registrations.py index e1d819b43bf..f13489f1201 100644 --- a/osf/models/registrations.py +++ b/osf/models/registrations.py @@ -14,15 +14,18 @@ UserObjectPermissionBase, ) from dirtyfields import DirtyFieldsMixin +import waffle from framework.auth import Auth from framework.exceptions import PermissionsError +from osf import features from osf.models import Identifier from osf.utils.fields import NonNaiveDateTimeField, LowercaseCharField from osf.utils.permissions import ADMIN, READ, WRITE from osf.exceptions import NodeStateError, DraftRegistrationStateError from osf.external.internet_archive.tasks import archive_to_ia, update_ia_metadata from osf.metrics import RegistriesModerationMetrics +from osf.metrics.es8_metrics import RegistriesModerationEventEs8 from osf.models.notification_type import NotificationTypeEnum from .action import RegistrationAction from .archive import ArchiveJob @@ -782,7 +785,17 @@ def _write_registration_action(self, from_state, to_state, initiated_by, comment comment=comment ) action.save() - RegistriesModerationMetrics.record_transitions(action) + if waffle.switch_is_active(features.ELASTICSEARCH_METRICS): + RegistriesModerationMetrics.record_transitions(action) + RegistriesModerationEventEs8.record( + registration_id=action.target._id, + provider_id=action.target.provider._id, + from_state=action.from_state, + to_state=action.to_state, + trigger=action.trigger, + user_id=action.creator._id, + comment=action.comment, + ) moderation_notifications = { RegistrationModerationTriggers.SUBMIT: notify.notify_submit, diff --git a/osf_tests/management_commands/test_reindex_es6.py b/osf_tests/management_commands/test_reindex_es6.py index 5e01be656a8..36158c18da6 100644 --- a/osf_tests/management_commands/test_reindex_es6.py +++ b/osf_tests/management_commands/test_reindex_es6.py @@ -10,7 +10,7 @@ AuthUserFactory ) -from elasticsearch_metrics.field import Keyword +from elasticsearch6_dsl import Keyword from tests.json_api_test_app import JSONAPITestApp diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py index 05baa4d38e7..f40b5dacec6 100644 --- a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py +++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py @@ -3,6 +3,7 @@ import logging from django.test import TestCase from osf.metrics.reporters import InstitutionalSummaryMonthlyReporter +from osf.metrics.reports import InstitutionMonthlySummaryReport from osf.metrics.utils import YearMonth from osf_tests.factories import ( InstitutionFactory, @@ -79,10 +80,10 @@ def _create_active_user(cls, institution, date_confirmed): def 
test_report_generation(self): reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list_monthly_reports(reporter) - self.assertEqual(len(reports), 1) + reports_raw = list_monthly_reports(reporter) + self.assertEqual(len(reports_raw[0]), 2) - report = reports[0] + report = next(r for r in reports_raw[0] if isinstance(r, InstitutionMonthlySummaryReport)) self.assertEqual(report.institution_id, self._institution._id) self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user self.assertEqual(report.public_project_count, 1) @@ -115,7 +116,8 @@ def test_report_generation_multiple_institutions(self): # Run the reporter for the current month (February 2018) reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list_monthly_reports(reporter) + reports_raw = list_monthly_reports(reporter) + reports = [item for sublist in reports_raw for item in sublist if isinstance(item, InstitutionMonthlySummaryReport)] self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3 # Extract reports by institution @@ -264,7 +266,8 @@ def test_high_counts_multiple_institutions(self): if enable_benchmarking: reporter_start_time = time.time() reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth) - reports = list_monthly_reports(reporter) + reports_raw = list_monthly_reports(reporter) + reports = [item for sublist in reports_raw for item in sublist if isinstance(item, InstitutionMonthlySummaryReport)] assert len(reports) == additional_institution_count + 1 if enable_benchmarking: diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py index 69bd266285a..082b330afd8 100644 --- a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py +++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py @@ -174,8 +174,10 @@ def test_no_data(self, ym_empty): def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0): _empty = list_monthly_reports(PublicItemUsageReporter(ym_empty)) - _sparse = list_monthly_reports(PublicItemUsageReporter(ym_sparse)) - _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy)) + _sparse_raw = list_monthly_reports(PublicItemUsageReporter(ym_sparse)) + _sparse = [item for sublist in _sparse_raw for item in sublist if isinstance(item, PublicItemUsageReport)] + _busy_raw = list_monthly_reports(PublicItemUsageReporter(ym_busy)) + _busy = [item for sublist in _busy_raw for item in sublist if isinstance(item, PublicItemUsageReport)] # empty month: assert _empty == [] diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 46375184f95..5228e2342c5 100644 --- a/osf_tests/metrics/test_daily_report.py +++ b/osf_tests/metrics/test_daily_report.py @@ -2,7 +2,7 @@ from unittest import mock import pytest -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from osf.metrics.reports import DailyReport, ReportInvalid @@ -10,8 +10,9 @@ class TestDailyReportKey: @pytest.fixture def mock_save(self): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save + with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save def test_default(self, mock_save): # only one of this type of report 
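Editor's note: several tests in this diff repeat the same flatten-and-filter comprehension now that reporters return mixed lists. A hypothetical helper, not in this PR, would keep them readable:

```python
# Hypothetical test helper (not in this diff): flatten the nested output
# of list_monthly_reports() and keep only one report type.
def pick_reports(reports_raw, report_cls):
    return [
        report
        for sublist in reports_raw
        for report in sublist
        if isinstance(report, report_cls)
    ]

# usage sketch:
#   reports = pick_reports(
#       list_monthly_reports(reporter), InstitutionMonthlySummaryReport)
```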
per day diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py new file mode 100644 index 00000000000..5bc6e4c4bc4 --- /dev/null +++ b/osf_tests/metrics/test_es8_metrics.py @@ -0,0 +1,92 @@ +import datetime + +from elasticsearch_metrics.tests.util import djelme_test_backends +import pytest + +from osf.metrics.es8_metrics import ( + PageviewInfo, + DailyDownloadCountReportEs8, + OsfCountedUsageEvent, +) + + +class TestEs8Metrics: + """smoke tests to check that djelme records can be saved and searched""" + + @pytest.fixture(autouse=True) + def _real_elastic(self): + with djelme_test_backends(): + yield + + def test_nested_pageview_autofill(self): + usage = OsfCountedUsageEvent.record( + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + pageview_info=PageviewInfo( + page_url='https://example.com/path/test', + referer_url='https://google.com', + route_name='foo.bar', + page_title='title title', + ), + ) + assert usage.pageview_info.page_path == '/path/test' + assert usage.pageview_info.referer_domain == 'google.com' + assert usage.pageview_info.hour_of_day == 15 + assert usage.item_iri in usage.within_iris + + def test_nested_pageview_autofill_dict(self): + usage = OsfCountedUsageEvent.record( + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + pageview_info={ + 'page_url': 'https://example.com/path/test', + 'referer_url': 'https://google.com', + 'route_name': 'foo.bar', + 'page_title': 'title title', + }, + ) + assert usage.pageview_info.page_path == '/path/test' + assert usage.pageview_info.referer_domain == 'google.com' + assert usage.pageview_info.hour_of_day == 15 + assert usage.item_iri in usage.within_iris + + def test_none_pageview_nested_autofill(self): + usage = OsfCountedUsageEvent.record( + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + ) + assert not usage.pageview_info + assert usage.item_iri in usage.within_iris + + def test_save_report(self): + _saved = DailyDownloadCountReportEs8.record( + cycle_coverage='2026.1.1', + daily_file_downloads=17, + ) + DailyDownloadCountReportEs8.refresh() + _response = DailyDownloadCountReportEs8.search().execute() + (_fetched,) = _response + assert _fetched.meta.id == _saved.meta.id + assert _fetched.cycle_coverage == '2026.1.1' + assert _fetched.daily_file_downloads == 17 diff --git a/osf_tests/metrics/test_metric_mixin.py b/osf_tests/metrics/test_metric_mixin.py index 4a2c32f7e71..ec9b2d302de 100644 --- a/osf_tests/metrics/test_metric_mixin.py +++ b/osf_tests/metrics/test_metric_mixin.py @@ -1,6 +1,6 @@ from unittest import mock import pytest -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from 
osf.metrics.metric_mixin import MetricMixin from osf.models import OSFUser diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py index 3c841e6555c..ba981e997d6 100644 --- a/osf_tests/metrics/test_monthly_report.py +++ b/osf_tests/metrics/test_monthly_report.py @@ -2,7 +2,7 @@ from unittest import mock import pytest -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from osf.metrics.reports import MonthlyReport, ReportInvalid, PublicItemUsageReport from osf.metrics.utils import YearMonth @@ -11,8 +11,9 @@ class TestMonthlyReportKey: @pytest.fixture def mock_save(self): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save + with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save def test_default(self, mock_save): # only one of this type of report per month @@ -79,6 +80,7 @@ class Meta: @pytest.mark.es_metrics +@pytest.mark.django_db class TestLastMonthReport: @pytest.fixture def osfid(self): diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py index 0e7ba6956bf..448a8136f7a 100644 --- a/osf_tests/metrics/test_spam_count_reporter.py +++ b/osf_tests/metrics/test_spam_count_reporter.py @@ -1,6 +1,7 @@ import pytest from datetime import datetime from osf.metrics.reporters.private_spam_metrics import PrivateSpamMetricsReporter +from osf.metrics.reports import PrivateSpamMetricsReport from osf.metrics.utils import YearMonth from osf_tests.factories import NodeLogFactory, NodeFactory from unittest.mock import patch @@ -30,7 +31,8 @@ def test_private_spam_metrics_reporter(): mock_akismet_get_hammed_count.return_value = 10 reporter = PrivateSpamMetricsReporter(report_yearmonth) - report = reporter.report() + reports_raw = reporter.report() + report = next(r for r in reports_raw if isinstance(r, PrivateSpamMetricsReport)) assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}" assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}" diff --git a/osf_tests/metrics/test_utils.py b/osf_tests/metrics/test_utils.py index a9d312f2331..47f16be6404 100644 --- a/osf_tests/metrics/test_utils.py +++ b/osf_tests/metrics/test_utils.py @@ -1,15 +1,20 @@ -from datetime import date +import datetime import pytest -from osf.metrics.utils import stable_key +from osf.metrics.utils import ( + stable_key, + cycle_coverage_date, + cycle_coverage_yearmonth, + YearMonth, +) class TestStableKey: @pytest.mark.parametrize('args, expected_key', [ (['foo'], '2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae'), - ([date(1953, 7, 2)], '3943be98daa91031ee7d0e0765472ce1b4a50a21f8c6dcd31047d530a50ada93'), - (['floo', 'blar', date(3049, 2, 2)], '853cef24d58fa8cd69b20d7dfbcdbd33f20ccda1a14f57e25e43c2533504b64f'), + ([datetime.date(1953, 7, 2)], '3943be98daa91031ee7d0e0765472ce1b4a50a21f8c6dcd31047d530a50ada93'), + (['floo', 'blar', datetime.date(3049, 2, 2)], '853cef24d58fa8cd69b20d7dfbcdbd33f20ccda1a14f57e25e43c2533504b64f'), ([1, 2, 7.3], '6ab892f8109fd23b03ab24aebc4e343ed2a058d9a72f750bf90ba051627d233e'), ]) def test_successes(self, args, expected_key): @@ -24,3 +29,13 @@ def test_successes(self, args, expected_key): def test_value_errors(self, args): with pytest.raises(ValueError): stable_key(*args) + + +def 
test_cycle_coverage_date(): + assert cycle_coverage_date(datetime.date(1234, 5, 6)) == '1234.5.6' + assert cycle_coverage_date(datetime.datetime(7654, 3, 2, 1)) == '7654.3.2' + + +def test_cycle_coverage_yearmonth(): + assert cycle_coverage_yearmonth(YearMonth(2222, 33)) == '2222.33' + assert cycle_coverage_yearmonth(datetime.date(1234, 5, 6)) == '1234.5' diff --git a/poetry.lock b/poetry.lock index bfcd4c5766f..5648455ccbe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. [[package]] name = "amqp" @@ -1085,27 +1085,24 @@ Django = ">=2.0" [[package]] name = "django-elasticsearch-metrics" -version = "2022.0.6" +version = "2026.0.4" description = "Django app for storing time-series metrics in Elasticsearch." optional = false -python-versions = "*" +python-versions = ">=3.10,<4" groups = ["main"] files = [] develop = false -[package.dependencies] -elasticsearch6-dsl = ">=6.3.0,<7.0.0" - [package.extras] -dev = ["factory-boy (==2.11.1)", "flake8 (==5.0.4)", "flake8-bugbear (==18.8.0)", "konch (>=3.0.0)", "mock", "pre-commit (==2.17.0)", "pytest", "pytest-django (==3.10.0)", "tox"] -lint = ["flake8 (==5.0.4)", "flake8-bugbear (==18.8.0)", "pre-commit (==2.17.0)"] -tests = ["factory-boy (==2.11.1)", "mock", "pytest", "pytest-django (==3.10.0)"] +anydjango = ["django"] +elastic6 = ["elasticsearch6-dsl (>=6.3.0,<7.0.0)"] +elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "f5b9312914154e213aa01731e934c593e3434269" -resolved_reference = "f5b9312914154e213aa01731e934c593e3434269" +reference = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2" +resolved_reference = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2" [[package]] name = "django-extensions" @@ -1189,7 +1186,7 @@ files = [ [package.dependencies] autopep8 = "*" Django = ">=3.2" -gprof2dot = ">=2017.09.19" +gprof2dot = ">=2017.9.19" sqlparse = "*" [[package]] @@ -1361,14 +1358,14 @@ stone = ">=2" [[package]] name = "elastic-transport" -version = "8.13.0" +version = "8.17.1" description = "Transport classes and utilities shared among Python Elastic client libraries" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "elastic-transport-8.13.0.tar.gz", hash = "sha256:2410ec1ff51221e8b3a01c0afa9f0d0498e1386a269283801f5c12f98e42dc45"}, - {file = "elastic_transport-8.13.0-py3-none-any.whl", hash = "sha256:aec890afdddd057762b27ff3553b0be8fa4673ec1a4fd922dfbd00325874bb3d"}, + {file = "elastic_transport-8.17.1-py3-none-any.whl", hash = "sha256:192718f498f1d10c5e9aa8b9cf32aed405e469a7f0e9d6a8923431dbb2c59fb8"}, + {file = "elastic_transport-8.17.1.tar.gz", hash = "sha256:5edef32ac864dca8e2f0a613ef63491ee8d6b8cfb52881fa7313ba9290cac6d2"}, ] [package.dependencies] @@ -1376,46 +1373,7 @@ certifi = "*" urllib3 = ">=1.26.2,<3" [package.extras] -develop = ["aiohttp", "furo", "httpx", "mock", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"] - -[[package]] -name = "elasticsearch" -version = "6.8.2" -description = "Python client for Elasticsearch" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4" -groups = ["main"] -files = [ - {file 
= "elasticsearch-6.8.2-py2.py3-none-any.whl", hash = "sha256:1aedf00b73f5d1e77cb4df70fec58f2efb664be4ce2686374239aa6c0373c65c"}, - {file = "elasticsearch-6.8.2.tar.gz", hash = "sha256:c3a560bb83e4981b5a5c82080d2ceb99686d33692ef53365656129478aa5ddb2"}, -] - -[package.dependencies] -urllib3 = ">=1.21.1" - -[package.extras] -develop = ["coverage", "mock", "nose", "nosexcover", "numpy", "pandas", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx (<1.7)", "sphinx-rtd-theme"] -requests = ["requests (>=2.4.0,<3.0.0)"] - -[[package]] -name = "elasticsearch-dsl" -version = "6.4.0" -description = "Python client for Elasticsearch" -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "elasticsearch-dsl-6.4.0.tar.gz", hash = "sha256:26416f4dd46ceca43d62ef74970d9de4bdd6f4b0f163316f0b432c9e61a08bec"}, - {file = "elasticsearch_dsl-6.4.0-py2.py3-none-any.whl", hash = "sha256:f60aea7fd756ac1fbe7ce114bbf4949aefbf495dfe8896640e787c67344f12f6"}, -] - -[package.dependencies] -elasticsearch = ">=6.0.0,<7.0.0" -python-dateutil = "*" -six = "*" - -[package.extras] -develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"] +develop = ["aiohttp", "furo", "httpx", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"] [[package]] name = "elasticsearch2" @@ -1471,6 +1429,32 @@ six = "*" [package.extras] develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"] +[[package]] +name = "elasticsearch8" +version = "8.19.3" +description = "Python client for Elasticsearch" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "elasticsearch8-8.19.3-py3-none-any.whl", hash = "sha256:4b52e59e68aea6f59bf37c28f6f4512333302dd8a52e26c17d0f10c076d833a1"}, + {file = "elasticsearch8-8.19.3.tar.gz", hash = "sha256:7effe95b360241b6d56ef68219037a90ad0f56723614db54bbe57d33058402f4"}, +] + +[package.dependencies] +elastic-transport = ">=8.15.1,<9" +python-dateutil = "*" +typing-extensions = "*" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +dev = ["aiohttp", "black", "build", "coverage", "isort", "jinja2", "mapbox-vector-tile", "mypy", "nox", "numpy", "orjson", "pandas", "pyarrow ; python_version < \"3.14\"", "pyright", "pytest", "pytest-asyncio", "pytest-cov", "pytest-mock", "python-dateutil", "pyyaml (>=5.4)", "requests (>=2,<3)", "simsimd", "tqdm", "twine", "types-python-dateutil", "types-tqdm", "unasync"] +docs = ["sphinx", "sphinx-autodoc-typehints", "sphinx-rtd-theme (>=2.0)"] +orjson = ["orjson (>=3)"] +pyarrow = ["pyarrow (>=1)"] +requests = ["requests (>=2.4.0,!=2.32.2,<3.0.0)"] +vectorstore-mmr = ["numpy (>=1)", "simsimd (>=3)"] + [[package]] name = "email-validator" version = "2.1.1" @@ -1771,12 +1755,12 @@ files = [ [package.dependencies] google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = ">=1.56.2,<2.0.dev0" -proto-plus = ">=1.22.3,<2.0.0dev" +proto-plus = ">=1.22.3,<2.0.0.dev0" protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" requests = ">=2.18.0,<3.0.0.dev0" [package.extras] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= 
\"3.11\""] +grpc = ["grpcio (>=1.33.2,<2.0.dev0)", "grpcio (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""] grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] @@ -1852,11 +1836,11 @@ files = [ ] [package.dependencies] -google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" -google-auth = ">=1.25.0,<3.0dev" +google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0.dev0" +google-auth = ">=1.25.0,<3.0.dev0" [package.extras] -grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] +grpc = ["grpcio (>=1.38.0,<2.0.dev0)", "grpcio-status (>=1.38.0,<2.0.dev0)"] [[package]] name = "google-cloud-storage" @@ -1871,15 +1855,15 @@ files = [ ] [package.dependencies] -google-api-core = ">=2.15.0,<3.0.0dev" -google-auth = ">=2.26.1,<3.0dev" -google-cloud-core = ">=2.3.0,<3.0dev" -google-crc32c = ">=1.0,<2.0dev" +google-api-core = ">=2.15.0,<3.0.0.dev0" +google-auth = ">=2.26.1,<3.0.dev0" +google-cloud-core = ">=2.3.0,<3.0.dev0" +google-crc32c = ">=1.0,<2.0.dev0" google-resumable-media = ">=2.6.0" -requests = ">=2.18.0,<3.0.0dev" +requests = ">=2.18.0,<3.0.0.dev0" [package.extras] -protobuf = ["protobuf (<5.0.0dev)"] +protobuf = ["protobuf (<5.0.0.dev0)"] [[package]] name = "google-crc32c" @@ -1934,11 +1918,11 @@ files = [ ] [package.dependencies] -google-crc32c = ">=1.0,<2.0dev" +google-crc32c = ">=1.0,<2.0.dev0" [package.extras] -aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "google-auth (>=1.22.0,<2.0dev)"] -requests = ["requests (>=2.18.0,<3.0.0dev)"] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "google-auth (>=1.22.0,<2.0.dev0)"] +requests = ["requests (>=2.18.0,<3.0.0.dev0)"] [[package]] name = "googleapis-common-protos" @@ -2317,7 +2301,7 @@ files = [ [package.dependencies] attrs = ">=22.2.0" -jsonschema-specifications = ">=2023.03.6" +jsonschema-specifications = ">=2023.3.6" referencing = ">=0.28.4" rpds-py = ">=0.7.1" @@ -3074,7 +3058,7 @@ files = [ ] [package.dependencies] -protobuf = ">=3.19.0,<6.0.0dev" +protobuf = ">=3.19.0,<6.0.0.dev0" [package.extras] testing = ["google-api-core (>=1.31.5)"] @@ -4104,10 +4088,10 @@ files = [ ] [package.dependencies] -botocore = ">=1.33.2,<2.0a.0" +botocore = ">=1.33.2,<2.0a0" [package.extras] -crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] +crt = ["botocore[crt] (>=1.33.2,<2.0a0)"] [[package]] name = "schema" @@ -4428,6 +4412,18 @@ files = [ {file = "types_python_dateutil-2.9.0.20240906-py3-none-any.whl", hash = "sha256:27c8cc2d058ccb14946eebcaaa503088f4f6dbc4fb6093d3d456a49aef2753f6"}, ] +[[package]] +name = "typing-extensions" +version = "4.15.0" +description = "Backported and Experimental Type Hints for Python 3.9+" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, + {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, +] + [[package]] name = "tzdata" version = "2024.1" @@ -4731,4 +4727,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "dfca5743cef25a20317ca8e7625404a7acbf9050461d7098684674ec109e41ee" +content-hash = "e323eef9d1d9eb45133fd2815ac003b7cf3d7a25682df321165ed091ab49436a" diff --git a/pyproject.toml b/pyproject.toml index 4a776719fac..e47e851fc44 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -31,9 +31,10 @@ Markupsafe = "2.1.5" blinker = "1.7.0" furl = "2.1.3" elasticsearch2 = "2.5.1" -elasticsearch = "6.8.2" # max version to support elasticsearch6 -elasticsearch-dsl = "6.4.0" # max version to support elasticsearch6 -elastic-transport = "8.13.0" +elasticsearch6 = "6.8.2" +elasticsearch6-dsl = "6.4.0" +elasticsearch8 = "8.19.3" +elastic-transport = "8.17.1" google-api-python-client = "2.123.0" google-auth = "2.29.0" Babel = "2.14.0" @@ -90,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "f5b9312914154e213aa01731e934c593e3434269"} # branch is feature/pin-esdsl +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" diff --git a/website/settings/defaults.py b/website/settings/defaults.py index fbe9b939ae1..ca95f9f0dab 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -107,6 +107,11 @@ def parent_dir(path): SEARCH_ENGINE = 'elastic' # Can be 'elastic', or None ELASTIC_URI = '127.0.0.1:9200' ELASTIC6_URI = os.environ.get('ELASTIC6_URI', '127.0.0.1:9201') +ELASTIC8_URI = os.environ.get('ELASTIC8_URI') +ELASTIC8_CERT_PATH = os.environ.get('ELASTIC8_CERT_PATH') +ELASTIC8_ASSERT_HOSTNAME = os.environ.get('ELASTIC8_ASSERT_HOSTNAME') +ELASTIC8_USERNAME = os.environ.get('ELASTIC8_USERNAME', 'elastic') +ELASTIC8_SECRET = os.environ.get('ELASTIC8_SECRET') ELASTIC_TIMEOUT = 10 ELASTIC_INDEX = 'website' ELASTIC_KWARGS = { @@ -411,6 +416,7 @@ class CeleryConfig: task_account_status_changes_queue = 'account_status_changes' task_external_high_queue = 'external_high' task_external_low_queue = 'external_low' + task_background_migration_queue = 'background_migration' external_high_modules = { 'osf.tasks.log_gv_addon', @@ -476,6 +482,10 @@ class CeleryConfig: 'scripts.enhanced_stuck_registration_audit', } + background_migration_modules = { + 'osf.management.commands.migrate_osfmetrics_6to8', + } + try: from kombu import Queue, Exchange except ImportError: @@ -529,12 +539,19 @@ class CeleryConfig: routing_key=task_external_low_queue, consumer_arguments={'x-priority': -2}, ), + Queue( + task_background_migration_queue, + Exchange(task_background_migration_queue), + routing_key=task_background_migration_queue, + consumer_arguments={'x-priority': -1}, + ), ) task_default_exchange_type = 'direct' task_routes = ('framework.celery_tasks.routers.CeleryRouter', ) task_ignore_result = True task_store_errors_even_if_ignored = True + result_extended = True broker_url = os.environ.get('BROKER_URL', f'amqp://{RABBITMQ_USERNAME}:{RABBITMQ_PASSWORD}@{RABBITMQ_HOST}:{RABBITMQ_PORT}/{RABBITMQ_VHOST}') broker_use_ssl = False @@ -583,6 +600,7 @@ class CeleryConfig: 'scripts.remove_after_use.merge_notification_subscription_provider_ct', 'scripts.disable_removed_beat_tasks', 'osf.management.commands.delete_withdrawn_or_failed_registration_files', + 'osf.management.commands.migrate_osfmetrics_6to8', ) # Modules that need metrics and release requirements
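Editor's note: the new ELASTIC8_* settings presumably feed an elasticsearch8 client somewhere inside django-elasticsearch-metrics; that wiring isn't shown in this diff, so the sketch below is an assumption based on the public elasticsearch8 client API, not the actual integration code.

```python
# Assumed wiring (not in this diff): build an elasticsearch8 client from
# the ELASTIC8_* settings added to website/settings/defaults.py.
from elasticsearch8 import Elasticsearch

from website import settings


def get_es8_client() -> Elasticsearch:
    kwargs = {}
    if settings.ELASTIC8_SECRET:
        kwargs['basic_auth'] = (settings.ELASTIC8_USERNAME, settings.ELASTIC8_SECRET)
    if settings.ELASTIC8_CERT_PATH:
        kwargs['ca_certs'] = settings.ELASTIC8_CERT_PATH
    if settings.ELASTIC8_ASSERT_HOSTNAME:
        kwargs['ssl_assert_hostname'] = settings.ELASTIC8_ASSERT_HOSTNAME
    return Elasticsearch(settings.ELASTIC8_URI, **kwargs)
```

`basic_auth`, `ca_certs`, and `ssl_assert_hostname` are standard elasticsearch8/elastic-transport options, which is consistent with the cert and hostname settings introduced above; the CI workflow instead disables xpack security and passes only ELASTIC8_URI.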