diff --git a/.docker-compose.env b/.docker-compose.env
index 9cb7a59e274..80eebc8707b 100644
--- a/.docker-compose.env
+++ b/.docker-compose.env
@@ -7,6 +7,8 @@ INTERNAL_DOMAIN=http://192.168.168.167:5000/
API_DOMAIN=http://localhost:8000/
ELASTIC_URI=192.168.168.167:9200
ELASTIC6_URI=192.168.168.167:9201
+ELASTIC8_URI=http://192.168.168.167:9202
+ELASTIC8_USERNAME=elastic
OSF_DB_HOST=192.168.168.167
DB_HOST=192.168.168.167
REDIS_HOST=redis://192.168.168.167:6379
diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
index 00c5820b5b8..f147941c5ff 100644
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -37,7 +37,19 @@ jobs:
permissions:
checks: write
services:
- postgres:
+ elasticsearch8: &ES8_SERVICE
+ image: elasticsearch:8.19.14
+ ports:
+ - 9202:9200
+ env:
+ discovery.type: single-node
+ xpack.security.enabled: false
+ options: >-
--health-cmd "curl -sf 'http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=30s'"
+ --health-interval 10s
+ --health-timeout 30s
+ --health-retries 5
+ postgres: &POSTGRES_SERVICE
image: postgres
env:
POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }}
@@ -54,6 +66,8 @@ jobs:
- uses: ./.github/actions/start-build
- name: Run tests
run: poetry run python3 -m invoke test-ci-addons --junit
+ env:
+ ELASTIC8_URI: http://localhost:9202
- name: Upload report
if: (success() || failure()) # run this step even if previous step failed
uses: ./.github/actions/gen-report
@@ -64,18 +78,7 @@ jobs:
permissions:
checks: write
services:
- postgres:
- image: postgres
- env:
- POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }}
- options: >-
- --health-cmd pg_isready
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- # Maps tcp port 5432 on service container to the host
- - 5432:5432
+ postgres: *POSTGRES_SERVICE
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/start-build
@@ -91,18 +94,8 @@ jobs:
permissions:
checks: write
services:
- postgres:
- image: postgres
- env:
- POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }}
- options: >-
- --health-cmd pg_isready
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- # Maps tcp port 5432 on service container to the host
- - 5432:5432
+ elasticsearch8: *ES8_SERVICE
+ postgres: *POSTGRES_SERVICE
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/start-build
@@ -110,6 +103,8 @@ jobs:
run: poetry run python3 -m invoke assets --dev
- name: Run test
run: poetry run python3 -m invoke test-ci-api1-and-js --junit
+ env:
+ ELASTIC8_URI: http://localhost:9202
- name: Upload report
if: (success() || failure()) # run this step even if previous step failed
uses: ./.github/actions/gen-report
@@ -120,23 +115,15 @@ jobs:
permissions:
checks: write
services:
- postgres:
- image: postgres
- env:
- POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }}
- options: >-
- --health-cmd pg_isready
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- # Maps tcp port 5432 on service container to the host
- - 5432:5432
+ elasticsearch8: *ES8_SERVICE
+ postgres: *POSTGRES_SERVICE
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/start-build
- name: Run tests
run: poetry run python3 -m invoke test-ci-api2 --junit
+ env:
+ ELASTIC8_URI: http://localhost:9202
- name: Upload report
if: (success() || failure()) # run this step even if previous step failed
uses: ./.github/actions/gen-report
@@ -147,19 +134,7 @@ jobs:
checks: write
needs: build-cache
services:
- postgres:
- image: postgres
-
- env:
- POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }}
- options: >-
- --health-cmd pg_isready
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- # Maps tcp port 5432 on service container to the host
- - 5432:5432
+ postgres: *POSTGRES_SERVICE
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/start-build
@@ -175,19 +150,7 @@ jobs:
checks: write
needs: build-cache
services:
- postgres:
- image: postgres
-
- env:
- POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }}
- options: >-
- --health-cmd pg_isready
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- # Maps tcp port 5432 on service container to the host
- - 5432:5432
+ postgres: *POSTGRES_SERVICE
mailhog:
image: mailhog/mailhog
ports:
@@ -208,19 +171,7 @@ jobs:
checks: write
needs: build-cache
services:
- postgres:
- image: postgres
-
- env:
- POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }}
- options: >-
- --health-cmd pg_isready
- --health-interval 10s
- --health-timeout 5s
- --health-retries 5
- ports:
- # Maps tcp port 5432 on service container to the host
- - 5432:5432
+ postgres: *POSTGRES_SERVICE
steps:
- uses: actions/checkout@v6
- uses: ./.github/actions/start-build
diff --git a/addons/base/views.py b/addons/base/views.py
index 8b4097244b3..12b78fb9957 100644
--- a/addons/base/views.py
+++ b/addons/base/views.py
@@ -14,7 +14,7 @@
import waffle
from django.db import transaction
from django.contrib.contenttypes.models import ContentType
-from elasticsearch import exceptions as es_exceptions
+from elasticsearch6 import exceptions as es_exceptions
from rest_framework import status as http_status
from api.caching.tasks import update_storage_usage_with_size
@@ -34,6 +34,7 @@
from framework.flask import redirect
from framework.sentry import log_exception
from framework.transactions.handlers import no_auto_transaction
+from osf.metrics.es8_metrics import OsfCountedUsageEvent
from website import settings
from addons.base import signals as file_signals
from addons.base.utils import format_last_known_metadata, get_mfr_url
@@ -691,6 +692,18 @@ def osfstoragefile_viewed_update_metrics(self, auth, fileversion, file_node):
version=fileversion.identifier,
path=file_node.path,
)
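+        # double-write: also record this usage in es8 (OsfCountedUsageEvent) alongside the es6 metric above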
+ OsfCountedUsageEvent.record(
+ user_id=getattr(user, '_id', None),
+ item_osfid=resource._id,
+ action_labels=[
+ OsfCountedUsageEvent.ActionLabel.VIEW.value,
+ OsfCountedUsageEvent.ActionLabel.WEB.value,
+ ],
+ # HACK: we don't have the user request, so fabricate a one-off session id
+ # (this means no double-click filtering for anonymous users (same as before)
+ # and potentially inflated "unique" sessionhour view counts)
+ client_session_id=str(uuid.uuid4()),
+ )
except es_exceptions.ConnectionError:
log_exception()
@@ -718,6 +731,17 @@ def osfstoragefile_downloaded_update_metrics(self, auth, fileversion, file_node)
version=fileversion.identifier,
path=file_node.path,
)
+ OsfCountedUsageEvent.record(
+ user_id=getattr(user, '_id', None),
+ item_osfid=resource._id,
+ action_labels=[
+ OsfCountedUsageEvent.ActionLabel.DOWNLOAD.value,
+ ],
+ # HACK: we don't have the user request, so fabricate a one-off session id
+ # (this means no double-click filtering for anonymous users (same as before)
+ # and potentially inflated "unique" sessionhour view counts)
+ client_session_id=str(uuid.uuid4()),
+ )
except es_exceptions.ConnectionError:
log_exception()
diff --git a/admin/management/urls.py b/admin/management/urls.py
index c046b3bed18..2e4cd7479a1 100644
--- a/admin/management/urls.py
+++ b/admin/management/urls.py
@@ -1,4 +1,4 @@
-from django.urls import re_path
+from django.urls import re_path, path
from admin.management import views
@@ -21,5 +21,6 @@
re_path(r'^sync_notification_templates', views.SyncNotificationTemplates.as_view(),
name='sync_notification_templates'),
re_path(r'^remove_orcid_from_user_social', views.RemoveOrcidFromUserSocial.as_view(),
- name='remove_orcid_from_user_social')
+ name='remove_orcid_from_user_social'),
+ path('migrate_osfmetrics_6to8', views.MigrateOsfmetrics6to8.as_view(), name='migrate_osfmetrics_6to8'),
]
diff --git a/admin/management/views.py b/admin/management/views.py
index f2052822f37..3c112347529 100644
--- a/admin/management/views.py
+++ b/admin/management/views.py
@@ -1,9 +1,12 @@
+from io import StringIO
+
from dateutil.parser import isoparse
from django.views.generic import TemplateView, View
from django.contrib import messages
from django.http import HttpResponse
from django.utils import timezone
from django.contrib.auth.mixins import PermissionRequiredMixin
+from django.core.management import call_command
from osf.management.commands.manage_switch_flags import manage_waffle
from osf.management.commands.update_registration_schemas import update_registration_schemas
@@ -190,3 +193,23 @@ def post(self, request):
remove_orcid_from_user_social()
messages.success(request, 'Orcid from user social have been successfully removed.')
return redirect(reverse('management:commands'))
+
+
+class MigrateOsfmetrics6to8(ManagementCommandPermissionView):
+ def post(self, request):
+ _command_kwargs = {
+ 'no_setup': True,
+ 'no_color': True,
+ 'no_counts': request.POST.get('no_counts'),
+ 'clear_state': request.POST.get('clear_state'),
+ 'clear_es8_data': request.POST.get('clear_es8_data'),
+ 'start': request.POST.get('start'),
+ 'unchanged': request.POST.get('unchanged'),
+ 'usage_reports': request.POST.get('usage_reports'),
+ 'usage_events': request.POST.get('usage_events'),
+ }
+ _out_io = StringIO()
+ call_command('migrate_osfmetrics_6to8', **_command_kwargs, stdout=_out_io)
+ for _line in _out_io.getvalue().split('\n'):
+ messages.info(request, _line)
+ return redirect(reverse('management:commands'))
diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html
index edf242abfdd..6b9ee927e0d 100644
--- a/admin/templates/management/commands.html
+++ b/admin/templates/management/commands.html
@@ -178,6 +178,31 @@
Remove existing orcid info from user social
+ migrate osf-metrics 6to8
+ view progress of the osf-metrics migration from elastic6 to elastic8 (or start it)
{% endblock %}
diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py
index 6199fd82d0e..ecf2825d4e8 100644
--- a/api/base/elasticsearch_dsl_views.py
+++ b/api/base/elasticsearch_dsl_views.py
@@ -3,7 +3,7 @@
import datetime
import typing
-import elasticsearch_dsl as edsl
+import elasticsearch6_dsl as edsl
from rest_framework import generics, exceptions as drf_exceptions
from rest_framework.settings import api_settings as drf_settings
from api.base.settings.defaults import REPORT_FILENAME_FORMAT
@@ -23,7 +23,7 @@
class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC):
- '''abstract view class using `elasticsearch_dsl.Search` as a queryset-analogue
+ '''abstract view class using `elasticsearch6_dsl.Search` as a queryset-analogue
builds a `Search` based on `self.get_default_search()` and the request's
query parameters for filtering, sorting, and pagination -- fetches only
@@ -36,7 +36,7 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView,
@abc.abstractmethod
def get_default_search(self) -> edsl.Search | None:
- '''the base `elasticsearch_dsl.Search` for this list, based on url path
+ '''the base `elasticsearch6_dsl.Search` for this list, based on url path
(common jsonapi query parameters will be considered automatically)
'''
@@ -95,7 +95,7 @@ def finalize_response(self, request, response, *args, **kwargs):
# (filtering handled in-view to reuse logic from FilterMixin)
filter_backends = ()
- # note: because elasticsearch_dsl.Search supports slicing and gives results when iterated on,
+ # note: because elasticsearch6_dsl.Search supports slicing and gives results when iterated on,
# it works fine with default pagination
# override rest_framework.generics.GenericAPIView
diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py
index 52d30b40f9a..ac9a9739f1b 100644
--- a/api/base/settings/defaults.py
+++ b/api/base/settings/defaults.py
@@ -320,10 +320,27 @@
HASHIDS_SALT = 'pinkhimalayan'
# django-elasticsearch-metrics
-ELASTICSEARCH_DSL = {
- 'default': {
- 'hosts': osf_settings.ELASTIC6_URI,
- 'retry_on_timeout': True,
+DJELME_BACKENDS = {
+ 'osfmetrics_es6': {
+ 'elasticsearch_metrics.imps.elastic6': {
+ 'hosts': osf_settings.ELASTIC6_URI,
+ 'retry_on_timeout': True,
+ },
+ },
+ 'osfmetrics_es8': {
+ 'elasticsearch_metrics.imps.elastic8': {
+ # passthru kwargs to elasticsearch8 connection constructor
+ 'hosts': osf_settings.ELASTIC8_URI,
+ 'ca_certs': osf_settings.ELASTIC8_CERT_PATH,
+ 'basic_auth': (
+ (osf_settings.ELASTIC8_USERNAME, osf_settings.ELASTIC8_SECRET)
+ if osf_settings.ELASTIC8_SECRET is not None
+ else None
+ ),
+ 'ssl_assert_hostname': osf_settings.ELASTIC8_ASSERT_HOSTNAME,
+ # djelme-specific kwargs
+ 'djelme_default_index_name_prefix': osf_settings.SHARE_PROVIDER_PREPEND,
+ },
},
}
# Store yearly indices for time-series metrics
diff --git a/api/metrics/serializers.py b/api/metrics/serializers.py
index 5bbde293505..9e3f61f5b50 100644
--- a/api/metrics/serializers.py
+++ b/api/metrics/serializers.py
@@ -6,6 +6,10 @@
from api.base.serializers import BaseAPISerializer
from api.base.utils import absolute_reverse
from osf.metrics.counted_usage import CountedAuthUsage, PageviewInfo
+from osf.metrics.es8_metrics import (
+ OsfCountedUsageEvent,
+ PageviewInfo as PageviewInfoEs8,
+)
from website import settings as website_settings
logger = logging.getLogger(__name__)
@@ -42,7 +46,7 @@ class PageviewInfoSerializer(ser.Serializer):
class CountedAuthUsageSerializer(ser.Serializer):
- item_guid = ser.CharField(max_length=255, required=False)
+ item_guid = ser.CharField(max_length=255, required=True)
client_session_id = ser.CharField(max_length=255, required=False)
provider_id = ser.CharField(max_length=255, required=False)
@@ -64,8 +68,21 @@ def validate(self, data):
def create(self, validated_data):
pageview_info = None
+ pageview_info_es8 = None
if pageview_info_data := validated_data.get('pageview_info'):
pageview_info = PageviewInfo(**pageview_info_data)
+ pageview_info_es8 = PageviewInfoEs8(**pageview_info_data)
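+        # double-write the same usage to es8 alongside the es6 CountedAuthUsage recorded below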
+ OsfCountedUsageEvent.record(
+ item_osfid=validated_data['item_guid'],
+ action_labels=validated_data.get('action_labels'),
+ provider_id=validated_data.get('provider_id'),
+ pageview_info=pageview_info_es8,
+ # used to create a COUNTER session-hour id, not stored:
+ client_session_id=validated_data.get('client_session_id'),
+ user_id=self.context.get('user_id'),
+ request_host=self.context.get('request_host'),
+ request_useragent=self.context.get('request_useragent'),
+ )
return CountedAuthUsage.record(
platform_iri=website_settings.DOMAIN,
provider_id=validated_data.get('provider_id'),
diff --git a/api/metrics/urls.py b/api/metrics/urls.py
index e135212541c..db63df3dd4c 100644
--- a/api/metrics/urls.py
+++ b/api/metrics/urls.py
@@ -5,7 +5,9 @@
app_name = 'osf'
urlpatterns = [
- re_path(r'^raw/(?P<url_path>[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name),
+ re_path(r'^raw/(?P<url_path>[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'djelme_backend_name': 'osfmetrics_es6'}),
+ path('raw-<str:djelme_backend_name>/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'url_path': ''}),
+ path('raw-<str:djelme_backend_name>/<path:url_path>', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name),
re_path(r'^preprints/views/$', views.PreprintViewMetrics.as_view(), name=views.PreprintViewMetrics.view_name),
re_path(r'^preprints/downloads/$', views.PreprintDownloadMetrics.as_view(), name=views.PreprintDownloadMetrics.view_name),
re_path(r'^registries_moderation/transitions/$', views.RegistriesModerationMetricsView.as_view(), name=views.RegistriesModerationMetricsView.view_name),
diff --git a/api/metrics/views.py b/api/metrics/views.py
index 99ecf3fe347..bd53bee296e 100644
--- a/api/metrics/views.py
+++ b/api/metrics/views.py
@@ -6,8 +6,9 @@
from django.http import JsonResponse, HttpResponse, Http404
from django.utils import timezone
-from elasticsearch.exceptions import NotFoundError, RequestError
-from elasticsearch_dsl.connections import get_connection
+from elasticsearch6.exceptions import NotFoundError, RequestError
+from elasticsearch6_dsl.connections import get_connection
+from elasticsearch_metrics.registry import djelme_registry
from framework.auth.oauth_scopes import CoreScopes
@@ -226,24 +227,49 @@ def delete(self, request, *args, **kwargs):
raise ValidationError('DELETE not supported. Use GET/POST/PUT')
@require_switch(ENABLE_RAW_METRICS)
- def get(self, request, *args, **kwargs):
- connection = get_connection()
- url_path = kwargs['url_path']
- return JsonResponse(connection.transport.perform_request('GET', f'/{url_path}'))
+ def get(self, request, *args, djelme_backend_name, url_path, **kwargs):
+ _response_body = self._do_es_request(
+ djelme_backend_name,
+ method='GET',
+ path=url_path,
+ qp=request.GET,
+ )
+ return JsonResponse(_response_body)
@require_switch(ENABLE_RAW_METRICS)
- def post(self, request, *args, **kwargs):
- connection = get_connection()
- url_path = kwargs['url_path']
- body = json.loads(request.body)
- return JsonResponse(connection.transport.perform_request('POST', f'/{url_path}', body=body))
+ def post(self, request, *args, djelme_backend_name, url_path, **kwargs):
+ _response_body = self._do_es_request(
+ djelme_backend_name,
+ method='POST',
+ path=url_path,
+ qp=request.GET,
+ body=json.loads(request.body),
+ )
+ return JsonResponse(_response_body)
@require_switch(ENABLE_RAW_METRICS)
- def put(self, request, *args, **kwargs):
- connection = get_connection()
- url_path = kwargs['url_path']
- body = json.loads(request.body)
- return JsonResponse(connection.transport.perform_request('PUT', f'/{url_path}', body=body))
+ def put(self, request, *args, djelme_backend_name, url_path, **kwargs):
+ _response_body = self._do_es_request(
+ djelme_backend_name,
+ method='PUT',
+ path=url_path,
+ qp=request.GET,
+ body=json.loads(request.body),
+ )
+ return JsonResponse(_response_body)
+
+ def _do_es_request(self, djelme_backend_name, method, path, qp, body=None):
+ _client = self._get_es_client(djelme_backend_name)
+ _perform_fn = getattr(_client, 'perform_request', None) or _client.transport.perform_request
+ _response = _perform_fn(method, f'/{path}', params=qp.dict(), body=body)
+ return _response if isinstance(_response, dict) else _response.body
+
+ def _get_es_client(self, djelme_backend_name):
+ try:
+ _backend = djelme_registry.get_backend(djelme_backend_name)
+ except LookupError:
+ raise Http404
+ return _backend.elastic_client
class RegistriesModerationMetricsView(GenericAPIView):
@@ -387,7 +413,14 @@ class CountedAuthUsageView(JSONAPIBaseView):
serializer_class = CountedAuthUsageSerializer
def post(self, request, *args, **kwargs):
- serializer = self.serializer_class(data=request.data)
+ serializer = self.serializer_class(
+ data=request.data,
+ context={
+ 'user_id': request.user._id if request.user.is_authenticated else None,
+ 'request_host': request.get_host(),
+ 'request_useragent': request.META.get('HTTP_USER_AGENT', ''),
+ },
+ )
serializer.is_valid(raise_exception=True)
if should_skip_counted_usage(
request.user,
@@ -403,6 +436,8 @@ def post(self, request, *args, **kwargs):
return HttpResponse(status=201)
def _get_session_id(self, request, client_session_id=None):
+ # NOTE: to remove after osfmetrics 6to8 migration -- logic moved to djelme
+
# get a session id as described in the COUNTER code of practice:
# https://cop5.projectcounter.org/en/5.0.2/07-processing/03-counting-unique-items.html
# -- different from the "login session" tracked by `osf.models.Session` (which
diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py
index c2a5c0fcf99..8b785504756 100644
--- a/api_tests/institutions/views/test_institution_department_list.py
+++ b/api_tests/institutions/views/test_institution_department_list.py
@@ -44,7 +44,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution):
department_name='Old Department',
public_project_count=1,
private_project_count=1,
- ).save(refresh=True)
+ ).save()
_this_month = YearMonth.from_date(datetime.date.today())
@@ -56,7 +56,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution):
department_name='New Department',
public_project_count=1,
private_project_count=1,
- ).save(refresh=True)
+ ).save()
# A second user entered the department
InstitutionalUserReport(
@@ -66,7 +66,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution):
department_name='New Department',
public_project_count=1,
private_project_count=1,
- ).save(refresh=True)
+ ).save()
# A new department with a single user to test sorting
InstitutionalUserReport(
@@ -76,7 +76,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution):
department_name='Smaller Department',
public_project_count=1,
private_project_count=1,
- ).save(refresh=True)
+ ).save()
# A user with no department
InstitutionalUserReport(
@@ -85,7 +85,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution):
institution_id=institution._id,
public_project_count=1,
private_project_count=1,
- ).save(refresh=True)
+ ).save()
@pytest.fixture()
def admin(self, institution):
@@ -113,6 +113,7 @@ def test_auth(self, app, url, user, admin):
assert resp.json['data'] == []
def test_get(self, app, url, admin, institution, populate_counts):
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
resp = app.get(url, auth=admin.auth)
assert resp.json['data'] == [{
diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py
index 41983458d2e..6dd6c5bbda3 100644
--- a/api_tests/institutions/views/test_institution_summary_metrics.py
+++ b/api_tests/institutions/views/test_institution_summary_metrics.py
@@ -84,6 +84,7 @@ def test_get_empty(self, app, url, institutional_admin):
assert resp.json['meta'] == {'version': '2.0'}
def test_get_report(self, app, url, institutional_admin, institution, reports, unshown_reports):
+ InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
resp = app.get(url, auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -149,6 +150,7 @@ def test_get_report_with_multiple_months_and_institutions(
monthly_logged_in_user_count=270,
monthly_active_user_count=260,
)
+ InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
resp = app.get(url, auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -189,6 +191,7 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_
institution,
user_count=4133,
)
+ InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -213,6 +216,7 @@ def test_get_with_invalid_report_date(self, app, url, institution, institutional
institution,
user_count=999,
)
+ InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
# Request with an invalid report_date format
resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth)
@@ -233,6 +237,7 @@ def test_get_without_report_date_uses_most_recent(self, app, url, institution, i
institution,
user_count=999,
)
+ InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern)
resp = app.get(url, auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -247,5 +252,5 @@ def _summary_report_factory(yearmonth, institution, **kwargs):
institution_id=institution._id,
**kwargs,
)
- report.save(refresh=True)
+ report.save()
return report
diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py
index 0826dcd0161..d2b99da435f 100644
--- a/api_tests/institutions/views/test_institution_user_metric_list.py
+++ b/api_tests/institutions/views/test_institution_user_metric_list.py
@@ -89,6 +89,7 @@ def test_get_empty(self, app, url, institutional_admin):
assert _resp.json['data'] == []
def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
_resp = app.get(url, auth=institutional_admin.auth)
assert _resp.status_code == 200
assert len(_resp.json['data']) == len(reports)
@@ -100,6 +101,7 @@ def test_get_reports(self, app, url, institutional_admin, institution, reports,
assert len(response_object['attributes']['contacts']) == 0
def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
for _query, _expected_user_ids in (
({'filter[department]': 'nunavum'}, set()),
({'filter[department]': 'incidentally'}, set()),
@@ -135,6 +137,7 @@ def test_filter_reports(self, app, url, institutional_admin, institution, report
assert set(_user_ids(_resp)) == _expected_user_ids
def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
for _query, _expected_user_id_list in (
({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']),
({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']),
@@ -144,6 +147,7 @@ def test_sort_reports(self, app, url, institutional_admin, institution, reports,
assert list(_user_ids(_resp)) == _expected_user_id_list
def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports):
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
for _query, _expected_user_id_list in (
({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']),
({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']),
@@ -178,6 +182,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu
month_last_active='2018-02',
month_last_login='2018-02',
)
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -281,6 +286,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu
str(736662999298 + i),
f'Jalen Hurts #{i}',
])
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
# Make request for CSV format with page[size]=10
resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth)
@@ -346,6 +352,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti
month_last_active='2018-02',
month_last_login='2018-02',
)
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
resp = app.get(f'{url}?format=json_report', auth=institutional_admin.auth)
assert resp.status_code == 200
@@ -411,6 +418,7 @@ def test_correct_number_of_contact_messages(self, app, url, institutional_admin,
department_name='a department, or so, that happens, incidentally, to have commas',
storage_byte_count=736662999298,
)
+ InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern)
receiver = user1
with capture_notifications():
@@ -477,5 +485,5 @@ def _report_factory(yearmonth, institution, **kwargs):
institution_id=institution._id,
**kwargs,
)
- _report.save(refresh=True)
+ _report.save()
return _report
diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py
index 0cd0b3bb180..016677c3a11 100644
--- a/api_tests/metrics/test_composite_query.py
+++ b/api_tests/metrics/test_composite_query.py
@@ -1,4 +1,3 @@
-import time
import pytest
from datetime import datetime
from osf_tests.factories import (
@@ -75,7 +74,7 @@ def test_elasticsearch_agg_query(self, app, user, base_url, preprint):
path=preprint.primary_file.path,
timestamp=datetime(year=2020, month=2, day=1)
)
- time.sleep(1) # gives ES some time to update
+ PreprintDownload._get_connection().indices.refresh(PreprintDownload._template_pattern)
resp = app.post_json_api(post_url, payload, auth=user.auth)
assert resp.status_code == 200
diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py
index be1d986ff6d..e954248c15b 100644
--- a/api_tests/metrics/test_counted_usage.py
+++ b/api_tests/metrics/test_counted_usage.py
@@ -16,6 +16,7 @@
)
from osf.utils.permissions import ADMIN, READ, WRITE
from api_tests.utils import create_test_file
+from elasticsearch_metrics.tests.util import djelme_test_backends
COUNTED_USAGE_URL = '/_/metrics/events/counted_usage/'
@@ -43,8 +44,9 @@ def assert_saved_with(mock_save, *, expected_doc_id=None, expected_attrs):
@pytest.fixture
def mock_save():
- with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
- yield mock_save
+ with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'):
+ with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
+ yield mock_save
@pytest.mark.django_db
@@ -73,6 +75,12 @@ def test_required_attributes(self, app, attrs):
@pytest.mark.django_db
class TestComputedFields:
+
+ @pytest.fixture(autouse=True)
+ def _real_elastic(self):
+ with djelme_test_backends():
+ yield
+
@pytest.fixture(autouse=True)
def mock_domain(self):
domain = 'http://example.foo/'
@@ -85,15 +93,22 @@ def mock_now(self):
with mock.patch('django.utils.timezone.now', return_value=timestamp):
yield timestamp
+ @pytest.fixture
+ def preprint(self, request):
+ return PreprintFactory(
+ is_public=True,
+ is_published=True,
+ )
+
@pytest.fixture()
def user(self):
with mock.patch('osf.models.base.generate_guid', return_value='guidy'):
return AuthUserFactory()
- def test_by_client_session_id(self, app, mock_save, user):
+ def test_by_client_session_id(self, app, mock_save, user, preprint):
payload = counted_usage_payload(
client_session_id='hello',
- item_guid='zyxwv',
+ item_guid=preprint._id,
action_labels=['view', 'api'],
pageview_info={'page_url': 'http://example.foo/blahblah/blee'},
)
@@ -108,7 +123,7 @@ def test_by_client_session_id(self, app, mock_save, user):
expected_doc_id='3239044c7462dd318edd0522a0ed7d84b9c6502ef16cb40dfcae6c1f456d57a2',
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': 'zyxwv',
+ 'item_guid': preprint._id,
# session_id: sha256(b'hello|1981-01-01').hexdigest()
'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34',
'action_labels': ['view', 'api'],
@@ -120,10 +135,10 @@ def test_by_client_session_id(self, app, mock_save, user):
},
)
- def test_by_client_session_id_anon(self, app, mock_save):
+ def test_by_client_session_id_anon(self, app, mock_save, preprint):
payload = counted_usage_payload(
client_session_id='hello',
- item_guid='zyxwv',
+ item_guid=preprint._id,
action_labels=['view', 'web'],
pageview_info={
'page_url': 'http://example.foo/bliz/',
@@ -141,7 +156,7 @@ def test_by_client_session_id_anon(self, app, mock_save):
expected_doc_id='d01759e963893f9dc9b2ccf016a5ef29135673779802b5578f31449543677e82',
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': 'zyxwv',
+ 'item_guid': preprint._id,
# session_id: sha256(b'hello|1981-01-01').hexdigest()
'session_id': '5b7c8b0a740a5b23712258a9d1164d2af008df02a8e3d339f16ead1d19595b34',
'action_labels': ['view', 'web'],
@@ -155,9 +170,9 @@ def test_by_client_session_id_anon(self, app, mock_save):
},
)
- def test_by_user_auth(self, app, mock_save, user):
+ def test_by_user_auth(self, app, mock_save, user, preprint):
payload = counted_usage_payload(
- item_guid='yxwvu',
+ item_guid=preprint._id,
action_labels=['view', 'web'],
pageview_info={
'page_url': 'http://osf.io/mst3k',
@@ -175,7 +190,7 @@ def test_by_user_auth(self, app, mock_save, user):
expected_doc_id='7b8bc27c6d90fb45aa5bbd02deceba9f7384ed61b9a6e7253317c262020b94c2',
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': 'yxwvu',
+ 'item_guid': preprint._id,
# session_id: sha256(b'guidy|1981-01-01|0').hexdigest()
'session_id': 'ec768abb16c3411570af99b9d635c2c32d1ca31d1b25eec8ee73759e7242e74a',
'action_labels': ['view', 'web'],
@@ -189,10 +204,14 @@ def test_by_user_auth(self, app, mock_save, user):
},
)
- def test_by_useragent_header(self, app, mock_save):
+ def test_by_useragent_header(self, app, mock_save, preprint):
payload = counted_usage_payload(
- item_guid='yxwvu',
+ item_guid=preprint._id,
action_labels=['view', 'api'],
+ pageview_info={
+ 'page_url': 'http://example.foo/bliz/',
+ 'referer_url': 'http://elsewhere.baz/index.php',
+ },
)
headers = {
'User-Agent': 'haha',
@@ -202,14 +221,20 @@ def test_by_useragent_header(self, app, mock_save):
assert_saved_with(
mock_save,
# doc_id: sha256(b'http://example.foo/|yxwvu|97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a|1981-01-01|3|api,view').hexdigest()
- expected_doc_id='d669528b30f443ffe506e183537af9624ef290090e90a200ecce7b7ca19c77f7',
+ expected_doc_id='6d7549df6734bb955eb832c6316ffae46c2959c95b5817ab4fcb341dbc875c23',
expected_attrs={
'platform_iri': 'http://example.foo/',
- 'item_guid': 'yxwvu',
+ 'item_guid': preprint._id,
# session_id: sha256(b'localhost:80|haha|1981-01-01|0').hexdigest()
'session_id': '97098dd3f7cd26053c0d0264d1c84eaeea8e08d2c55ca34017ffbe53c749ba5a',
'action_labels': ['view', 'api'],
- 'pageview_info': None,
+ 'pageview_info': {
+ 'page_url': 'http://example.foo/bliz/',
+ 'page_path': '/bliz',
+ 'referer_url': 'http://elsewhere.baz/index.php',
+ 'referer_domain': 'elsewhere.baz',
+ 'hour_of_day': 0,
+ },
},
)
@@ -217,6 +242,12 @@ def test_by_useragent_header(self, app, mock_save):
@pytest.mark.parametrize('item_public', [True, False])
@pytest.mark.django_db
class TestGuidFields:
+
+ @pytest.fixture(autouse=True)
+ def _real_elastic(self):
+ with djelme_test_backends():
+ yield
+
@pytest.fixture
def preprint(self, item_public):
return PreprintFactory(
@@ -261,7 +292,7 @@ def test_preprint_file(self, app, mock_save, preprint, item_public):
item_guid=preprint._id,
action_labels=['view', 'web'],
)
- resp = app.post_json_api(COUNTED_USAGE_URL, payload)
+ resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
mock_save,
@@ -280,7 +311,7 @@ def test_preprint_file(self, app, mock_save, preprint, item_public):
item_guid=preprint.primary_file.get_guid(create=True)._id,
action_labels=['view', 'web'],
)
- resp = app.post_json_api(COUNTED_USAGE_URL, payload)
+ resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
mock_save,
@@ -299,7 +330,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil
item_guid=child_reg_file_guid,
action_labels=['view', 'web'],
)
- resp = app.post_json_api(COUNTED_USAGE_URL, payload)
+ resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
mock_save,
@@ -322,7 +353,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil
item_guid=child_reg._id,
action_labels=['view', 'web'],
)
- resp = app.post_json_api(COUNTED_USAGE_URL, payload)
+ resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
mock_save,
@@ -344,7 +375,7 @@ def test_child_registration_file(self, app, mock_save, child_reg_file_guid, chil
item_guid=parent_reg._id,
action_labels=['view', 'web'],
)
- resp = app.post_json_api(COUNTED_USAGE_URL, payload)
+ resp = app.post_json_api(COUNTED_USAGE_URL, payload, headers={'User-Agent': 'blarg'})
assert resp.status_code == 201
assert_saved_with(
mock_save,
diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py
index 1bde8719b75..cd9b8041c2d 100644
--- a/api_tests/metrics/test_preprint_metrics.py
+++ b/api_tests/metrics/test_preprint_metrics.py
@@ -8,7 +8,7 @@
from django.utils import timezone
from waffle.testutils import override_switch
-from elasticsearch.exceptions import RequestError
+from elasticsearch6.exceptions import RequestError
from osf import features
from api.base.settings import API_PRIVATE_BASE as API_BASE
diff --git a/api_tests/metrics/test_raw_metrics.py b/api_tests/metrics/test_raw_metrics.py
index 6a3b9b8f8c5..e32936d9024 100644
--- a/api_tests/metrics/test_raw_metrics.py
+++ b/api_tests/metrics/test_raw_metrics.py
@@ -1,10 +1,10 @@
import pytest
-import time
from website.app import setup_django
setup_django()
from waffle.testutils import override_switch
+from elasticsearch6_dsl.connections import connections as es6_connections
from osf import features
from osf_tests.factories import AuthUserFactory
@@ -40,9 +40,9 @@ def user(self):
def other_user(self):
return AuthUserFactory()
- @pytest.fixture
- def base_url(self):
- return f'/{API_BASE}metrics/raw/'
+ @pytest.fixture(params=['raw', 'raw-osfmetrics_es6'])
+ def base_url(self, request):
+ return f'/{API_BASE}metrics/{request.param}/'
def test_delete(self, app, user, base_url):
res = app.delete_json_api(base_url, auth=user.auth, expect_errors=True)
@@ -136,7 +136,9 @@ def test_post_and_get(self, app, user, base_url):
res = app.post_json_api(post_url, post_data, auth=user.auth)
assert res.json == post_return
- time.sleep(3)
+ es6_connections.get_connection('osfmetrics_es6').indices.refresh(
+ index='customer',
+ )
get_url = f'{base_url}customer/_search?q=*'
res = app.get(get_url, auth=user.auth)
diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py
index 93469b1b3b5..f5d3a047b10 100644
--- a/api_tests/metrics/test_registries_moderation_metrics.py
+++ b/api_tests/metrics/test_registries_moderation_metrics.py
@@ -1,8 +1,5 @@
import pytest
-from waffle.testutils import override_switch
-import time
-from osf import features
from osf_tests.factories import RegistrationFactory, AuthUserFactory
from osf.utils.workflows import RegistrationModerationStates, RegistrationModerationTriggers
from osf.metrics import RegistriesModerationMetrics
@@ -18,11 +15,6 @@ class TestRegistrationModerationMetrics:
def registration(self):
return RegistrationFactory()
- @pytest.fixture(autouse=True)
- def enable_elasticsearch_metrics(self):
- with override_switch(features.ELASTICSEARCH_METRICS, active=True):
- yield
-
@pytest.mark.es_metrics
def test_record_transitions(self, registration):
with capture_notifications():
@@ -32,7 +24,7 @@ def test_record_transitions(self, registration):
registration.creator,
'Metrics is easy'
)
- time.sleep(1)
+ RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern)
assert RegistriesModerationMetrics.search().count() == 1
data = RegistriesModerationMetrics.search().execute()['hits']['hits'][0]['_source']
@@ -51,11 +43,6 @@ class TestRegistrationModerationMetricsView:
def registration(self):
return RegistrationFactory()
- @pytest.fixture(autouse=True)
- def enable_elasticsearch_metrics(self):
- with override_switch(features.ELASTICSEARCH_METRICS, active=True):
- yield
-
@pytest.fixture
def user(self):
user = AuthUserFactory()
@@ -81,7 +68,7 @@ def test_registries_moderation_view(self, app, user, base_url, registration):
registration.creator,
'Metrics is easy'
)
- time.sleep(1)
+ RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern)
res = app.get(base_url, auth=user.auth, expect_errors=True)
data = res.json
diff --git a/conftest.py b/conftest.py
index 9494e3d296e..e80c4e5c566 100644
--- a/conftest.py
+++ b/conftest.py
@@ -1,24 +1,24 @@
-import contextlib
from unittest import mock
import logging
import os
import re
-from django.core.management import call_command
from django.db import transaction
-from elasticsearch import exceptions as es_exceptions
-from elasticsearch_dsl.connections import connections
-from elasticsearch_metrics.registry import registry as es_metrics_registry
+from elasticsearch6_dsl.connections import connections
+from elasticsearch_metrics.tests.util import djelme_test_backends
from faker import Factory
import pytest
import responses
import xml.etree.ElementTree as ET
+from waffle.testutils import override_switch
from api_tests.share import _utils as shtrove_test_utils
from framework.celery_tasks import app as celery_app
from osf.external.spam import tasks as spam_tasks
from website import settings as website_settings
from osf.management.commands.populate_notification_types import populate_notification_types
+from osf import features
+
def pytest_configure(config):
if not os.getenv('GITHUB_ACTIONS') == 'true':
@@ -43,6 +43,8 @@ def pytest_configure(config):
'transitions.core',
'MARKDOWN',
'elasticsearch',
+ 'elastic_transport',
+ 'elasticsearch_metrics',
]
for logger_name in SILENT_LOGGERS:
logging.getLogger(logger_name).setLevel(logging.CRITICAL)
@@ -138,45 +140,20 @@ def es6_client(setup_connections):
@pytest.fixture(scope='function', autouse=True)
-def _es_metrics_marker(request, worker_id):
+def _es_metrics_marker(request):
"""Clear out all indices and index templates before and after
tests marked with `es_metrics`.
"""
marker = request.node.get_closest_marker('es_metrics')
- if marker:
- es6_client = request.getfixturevalue('es6_client')
- _temp_prefix = 'temp_metrics_'
- _temp_wildcard = f'{_temp_prefix}-{worker_id}*'
-
- def _teardown_es_temps():
- es6_client.indices.delete(index=_temp_wildcard)
- try:
- es6_client.indices.delete_template(_temp_wildcard)
- except es_exceptions.NotFoundError:
- pass
-
- @contextlib.contextmanager
- def _mock_metric_names():
- with contextlib.ExitStack() as _exit:
- for _metric_class in es_metrics_registry.get_metrics():
- _exit.enter_context(mock.patch.object(
- _metric_class,
- '_template_name', # also used to construct index names
- f'{_temp_prefix}-{worker_id}{_metric_class._template_name}',
- ))
- _exit.enter_context(mock.patch.object(
- _metric_class,
- '_template', # a wildcard string for indexes and templates
- f'{_temp_prefix}-{worker_id}{_metric_class._template}',
- ))
- yield
-
- _teardown_es_temps()
- with _mock_metric_names():
- call_command('sync_metrics')
- yield
- _teardown_es_temps()
- else:
+
+ if not marker:
+ yield
+ return
+
+ with (
+ override_switch(features.ELASTICSEARCH_METRICS, active=True),
+ djelme_test_backends(),
+ ):
yield
diff --git a/docker-compose.yml b/docker-compose.yml
index 9914c24728b..42f7efc5ce7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -13,6 +13,8 @@ volumes:
external: false
elasticsearch6_data_vol:
external: false
+ elasticsearch8_data_vol:
+ external: false
rabbitmq_vol:
external: false
preprints_dist_vol:
@@ -68,10 +70,35 @@ services:
# Temporary: Remove when we've upgraded to ES6
elasticsearch6:
image: docker.elastic.co/elasticsearch/elasticsearch:6.3.1
+ environment:
+ - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage
ports:
- 9201:9200
volumes:
- elasticsearch6_data_vol:/usr/share/elasticsearch/data
+ healthcheck:
+ start_period: 15s
+ test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"'
+ interval: 10s
+ retries: 30
+ stdin_open: true
+
+ elasticsearch8:
+ image: elasticsearch:8.19.14
+ environment:
+ - discovery.type=single-node
+ - xpack.security.enabled=false
+ - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage
+ - xpack.ml.enabled=false
+ ports:
+ - 9202:9200
+ volumes:
+ - elasticsearch8_data_vol:/usr/share/elasticsearch/data
+ healthcheck:
+ start_period: 15s
+ test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"'
+ interval: 10s
+ retries: 30
stdin_open: true
postgres:
diff --git a/framework/celery_tasks/routers.py b/framework/celery_tasks/routers.py
index c33238780e8..d9d6e335286 100644
--- a/framework/celery_tasks/routers.py
+++ b/framework/celery_tasks/routers.py
@@ -11,6 +11,8 @@ def match_by_module(task_path):
return CeleryConfig.task_med_queue
if task_subpath in CeleryConfig.high_pri_modules:
return CeleryConfig.task_high_queue
+ if task_subpath in CeleryConfig.background_migration_modules:
+ return CeleryConfig.task_background_migration_queue
if task_subpath in CeleryConfig.remote_computing_modules:
return CeleryConfig.task_remote_computing_queue
if task_subpath in CeleryConfig.account_status_changes_modules:
diff --git a/osf/management/commands/check_deploy_ready.py b/osf/management/commands/check_deploy_ready.py
index 39fc98e9bee..527c900f273 100644
--- a/osf/management/commands/check_deploy_ready.py
+++ b/osf/management/commands/check_deploy_ready.py
@@ -18,6 +18,6 @@ def handle(self, *args, **options):
]
if waffle.switch_is_active(features.ELASTICSEARCH_METRICS):
- CHECKS.append(['check_metrics'])
+ CHECKS.append(['djelme_backend_check'])
for check in CHECKS:
call_command(*check)
diff --git a/osf/management/commands/fake_metrics_reports.py b/osf/management/commands/fake_metrics_reports.py
index 765d6e475c1..53e13472e74 100644
--- a/osf/management/commands/fake_metrics_reports.py
+++ b/osf/management/commands/fake_metrics_reports.py
@@ -8,6 +8,8 @@
UserSummaryReport,
PreprintSummaryReport,
)
+from osf.metrics.reports import PublicItemUsageReport
+from osf.metrics.utils import YearMonth
from osf.models import PreprintProvider
@@ -53,10 +55,27 @@ def fake_preprint_counts(days_back):
).save()
+def fake_usage_reports(osfid: str, count: int):
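+    # fake usage reports for `osfid`, one per month, walking backward from last month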
+ _ym = YearMonth.from_date(date.today()).prior()
+ for _months in range(count):
+ PublicItemUsageReport.record(
+ item_osfid=osfid,
+ report_yearmonth=_ym,
+ view_count=(_vc := randint(0, 500)),
+ view_session_count=randint(0, _vc),
+ download_count=(_dc := randint(0, 300)),
+ download_session_count=randint(0, _dc),
+ )
+ _ym = _ym.prior()
+
+
class Command(BaseCommand):
def handle(self, *args, **kwargs):
if not settings.DEBUG:
raise NotImplementedError('fake_reports requires DEBUG mode')
fake_user_counts(1000)
fake_preprint_counts(1000)
+ fake_usage_reports('blarg', 100)
+ fake_usage_reports('blerg', 50)
+ fake_usage_reports('bleg', 50)
# TODO: more reports
diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py
new file mode 100644
index 00000000000..0c71d7b2307
--- /dev/null
+++ b/osf/management/commands/migrate_osfmetrics_6to8.py
@@ -0,0 +1,803 @@
+import collections
+import datetime
+import functools
+import logging
+
+from django.apps import apps
+from django.core.management import call_command
+from django.core.management.base import BaseCommand
+from django.db import OperationalError as DjangoOperationalError
+from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError
+from elasticsearch6 import helpers as es6_helpers
+from elasticsearch6_dsl.connections import connections as es6_connections
+from elasticsearch8.exceptions import TransportError as Elastic8TransportError
+from elasticsearch8.helpers import BulkIndexError as Elastic8BulkIndexError
+from elasticsearch_metrics.registry import djelme_registry
+from elasticsearch_metrics.imps import elastic8 as djel8me
+from psycopg2 import OperationalError as PostgresOperationalError
+
+from framework.celery_tasks import app as celery_app
+from osf.metadata.rdfutils import OSF
+from osf.metadata.osfmap_utils import osfmap_type_from_model, osf_iri, is_osf_component
+from osf.metrics.preprint_metrics import (
+ PreprintView,
+ PreprintDownload,
+)
+from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6
+from osf.metrics import reports as es6_reports
+from osf.metrics import es8_metrics, RegistriesModerationMetrics
+from osf.metrics.reporters.public_item_usage import _iter_composite_bucket_keys
+from osf.metrics.utils import YearMonth
+from osf import models as osfdb
+from website import settings as website_settings
+
+
+_logger = logging.getLogger(__name__)
+
+###
+# constants
+
+_USAGE_DAYS_BACK = 99
+
+_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control
+
+_UNCHANGED_RECORDTYPES = {
+ # reports
+ es6_reports.StorageAddonUsage: es8_metrics.DailyStorageAddonUsageReportEs8,
+ es6_reports.DownloadCountReport: es8_metrics.DailyDownloadCountReportEs8,
+ es6_reports.InstitutionSummaryReport: es8_metrics.DailyInstitutionSummaryReportEs8,
+ es6_reports.NewUserDomainReport: es8_metrics.DailyNewUserDomainReportEs8,
+ es6_reports.NodeSummaryReport: es8_metrics.DailyNodeSummaryReportEs8,
+ es6_reports.OsfstorageFileCountReport: es8_metrics.DailyOsfstorageFileCountReportEs8,
+ es6_reports.PreprintSummaryReport: es8_metrics.DailyPreprintSummaryReportEs8,
+ es6_reports.UserSummaryReport: es8_metrics.DailyUserSummaryReportEs8,
+ es6_reports.SpamSummaryReport: es8_metrics.MonthlySpamSummaryReportEs8,
+ es6_reports.InstitutionalUserReport: es8_metrics.MonthlyInstitutionalUserReportEs8,
+ es6_reports.InstitutionMonthlySummaryReport: es8_metrics.MonthlyInstitutionSummaryReportEs8,
+ es6_reports.PrivateSpamMetricsReport: es8_metrics.MonthlyPrivateSpamMetricsReportEs8,
+ # events
+ RegistriesModerationMetrics: es8_metrics.RegistriesModerationEventEs8,
+}
+
+_TASK_KWARGS = dict(
+ autoretry_for=(
+ DjangoOperationalError,
+ Elastic6ConnectionError,
+ Elastic8TransportError,
+ PostgresOperationalError,
+ ),
+ retry_backoff=True, # exponential backoff, with jitter
+ max_retries=20,
+)
+
+###
+# celery tasks
+
+
+@celery_app.task(**_TASK_KWARGS)
+def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str):
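+    # copy es6 records to es8 as-is; cyclic (daily/monthly) records get their report keys converted to cycle_coverage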
+ _es6_recordtype = djelme_registry.get_recordtype('osf', es6_recordtype_name)
+ _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype]
+ _convert_kwargs = (
+ _convert_unchanged_cyclicrecord_kwargs
+ if issubclass(_es8_recordtype, djel8me.CyclicRecord)
+ else (lambda _kw: _kw) # no conversion needed for event record
+ )
+ _each_new = (
+ _es8_recordtype(**_convert_kwargs(_hit['_source']))
+ for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when)
+ )
+ _es8_bulk_save(_es8_recordtype, _each_new)
+
+
+@celery_app.task(**_TASK_KWARGS)
+def migrate_counted_usages(from_when: str, until_when: str):
+ # CountedAuthUsage => OsfCountedUsageEvent
+ _each_new = (
+ _convert_counted_usage(_hit['_source'])
+ for _hit in _es6_scan_range(
+ CountedUsageEs6,
+ from_when=from_when,
+ until_when=until_when,
+ addl_filter={'exists': {'field': 'item_guid'}},
+ )
+ )
+ _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new)
+
+
+@celery_app.task(**_TASK_KWARGS)
+def migrate_preprint_views(from_when: str, until_when: str):
+ # PreprintView => OsfCountedUsageEvent
+ _action_labels = ['view', 'web']
+ _each_new = (
+ _convert_preprint_metric(_hit, _action_labels)
+ for _hit in _es6_scan_range(
+ PreprintView, from_when=from_when, until_when=until_when
+ )
+ )
+ _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new)
+
+
+@celery_app.task(**_TASK_KWARGS)
+def migrate_preprint_downloads(from_when: str, until_when: str):
+ # PreprintDownload => OsfCountedUsageEvent
+ _action_labels = ['download']
+ _each_new = (
+ _convert_preprint_metric(_hit, _action_labels)
+ for _hit in _es6_scan_range(
+ PreprintDownload, from_when=from_when, until_when=until_when
+ )
+ )
+ _es8_bulk_save(es8_metrics.OsfCountedUsageEvent, _each_new)
+
+
+@celery_app.task(**_TASK_KWARGS)
+def migrate_usage_reports(osfid: str, until_when: str):
+ # from PublicItemUsageReport to MonthlyPublicItemUsageReportEs8
+ _osfguid = osfdb.Guid.load(osfid)
+ _item_is_component = is_osf_component(_osfguid.referent) if _osfguid else False
+
+ def _each_new():
+ # go in sorted order to build cumulative counts
+ # (only a few dozen of these per item; should be fine to sort and load all at once)
+ _each_hit = _es6_scan_range(
+ es6_reports.PublicItemUsageReport,
+ until_when=until_when,
+ addl_filter={'term': {'item_osfid': osfid}},
+ sort='report_yearmonth',
+ )
+ _prior_report = None
+ for _hit in list(_each_hit):
+ yield (
+ _prior_report := _convert_public_usage_report(
+ _hit['_source'],
+ _prior_report,
+ item_is_component=_item_is_component,
+ )
+ )
+
+ _es8_bulk_save(es8_metrics.MonthlyPublicItemUsageReportEs8, _each_new())
+
+
+###
+# various helper functions
+
+
+def _es6_connection():
+ return es6_connections.get_connection('osfmetrics_es6')
+
+
+def _es8_bulk_save(es8_recordtype, each_new_record):
+ try:
+ es8_recordtype.bulk(each_new_record, stats_only=True)
+ except Elastic8BulkIndexError as _bulk_error:
+ # so actual errors show in celery task result
+ raise Exception(_bulk_error.errors) from _bulk_error
+
+
+def _date_range(
+ range_start: datetime.date,
+ range_end: datetime.date,
+ step: datetime.timedelta = datetime.timedelta(days=1),
+) -> collections.abc.Iterator[tuple[datetime.date, datetime.date]]:
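+    # yield consecutive (from_date, until_date) pairs stepping through [range_start, range_end)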
+ _from_date = range_start
+ _until_date = range_start + step
+ while _from_date < range_end:
+ yield (_from_date, _until_date)
+ (_from_date, _until_date) = (_until_date, _until_date + step)
+
+
+def _es6_scan_range(
+ es6_recordtype,
+ *,
+ from_when: str = '',
+ until_when: str,
+ addl_filter=None,
+ sort=None,
+):
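+    # stream all es6 hits with `timestamp` in [from_when, until_when), optionally filtered and sorted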
+ _timestamp_range = {'lt': until_when}
+ if from_when:
+ _timestamp_range['gte'] = from_when
+ _filters = [
+ {'range': {'timestamp': _timestamp_range}},
+ ]
+ if addl_filter:
+ _filters.append(addl_filter)
+ _query_body = {'query': {'bool': {'filter': _filters}}}
+ if sort:
+ _query_body['sort'] = sort
+ return es6_helpers.scan(
+ _es6_connection(),
+ index=es6_recordtype._template_pattern,
+ query=_query_body,
+ )
+
+
+def _es6_usage_report_counts() -> tuple[int, int]:
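+    # returns (total report count, approximate distinct item count) from the es6 usage reports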
+ _search = es6_reports.PublicItemUsageReport.search()
+ _search.aggs.metric(
+ 'agg_item_count',
+ 'cardinality',
+ field='item_osfid',
+ precision_threshold=_MAX_CARDINALITY_PRECISION,
+ )
+ _response = _search.execute()
+ _total_count = _response.hits.total
+ _item_count = (
+ _response.aggregations.agg_item_count.value
+ if 'agg_item_count' in _response.aggregations
+ else 0
+ )
+ return (_total_count, _item_count)
+
+
+def _es8_usage_report_counts() -> tuple[int, int]:
+ _search = es8_metrics.MonthlyPublicItemUsageReportEs8.search()
+ _search.aggs.metric(
+ 'agg_item_count',
+ 'cardinality',
+ field='item_osfid',
+ precision_threshold=_MAX_CARDINALITY_PRECISION,
+ )
+ _response = _search.execute()
+ _total_count = _response.hits.total.value
+ _item_count = (
+ _response.aggregations.agg_item_count.value
+ if 'agg_item_count' in _response.aggregations
+ else 0
+ )
+ return (_total_count, _item_count)
+
+
+def _get_es6_field_names(es6_recordtype):
+ '''
+ adapted from DocumentBase._get_field_names in elasticsearch8.dsl
+ '''
+ for _field_name in es6_recordtype._doc_type.mapping:
+ _field = es6_recordtype._doc_type.mapping[_field_name]
+ if hasattr(_field, '_doc_class'):
+ for _sub_field in _get_es6_field_names(_field._doc_class):
+ yield f'{_field_name}.{_sub_field}'
+ else:
+ yield _field_name
+
+
+def _assert_field_unchangedness(es6_recordtype, es8_recordtype):
+ _es6_fields = set(_get_es6_field_names(es6_recordtype))
+ _es8_fields = set(es8_recordtype._get_field_names())
+
+ # remove fields intentionally removed in migration
+ if issubclass(es6_recordtype, es6_reports.DailyReport):
+ assert issubclass(es8_recordtype, djel8me.CyclicRecord)
+ _es6_fields.remove('timestamp')
+ _es6_fields.remove('report_date')
+ elif issubclass(es6_recordtype, es6_reports.MonthlyReport):
+ assert issubclass(es8_recordtype, djel8me.CyclicRecord)
+ _es6_fields.remove('timestamp')
+ _es6_fields.remove('report_yearmonth')
+ else:
+ assert issubclass(es8_recordtype, djel8me.EventRecord)
+
+ # remove fields intentionally added in migration
+ _es8_fields.remove('timeseries_timeparts')
+ if issubclass(es8_recordtype, djel8me.CyclicRecord):
+ _es8_fields.remove('created')
+ _es8_fields.remove('cycle_coverage')
+
+ # all remaining fields should match
+ assert _es6_fields == _es8_fields
+
+
+def _semverish_from_yearmonth(given_yearmonth: str):
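+    # e.g. '2024-08' -> '2024.8' (the cycle_coverage format for monthly records)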
+ _ym = YearMonth.from_str(given_yearmonth)
+ return f'{_ym.year}.{_ym.month}'
+
+
+def _semverish_from_date(given_date: str):
+ _d = datetime.date.fromisoformat(given_date)
+ return f'{_d.year}.{_d.month}.{_d.day}'
+
+
+def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict:
+ def _each_kwarg():
+ for _key, _val in es6_source.items():
+ if _key == 'report_yearmonth':
+ # report_yearmonth converts to cycle_coverage Y.M
+ yield ('cycle_coverage', _semverish_from_yearmonth(_val))
+ elif _key == 'report_date':
+ # report_date converts to cycle_coverage Y.M.D
+ yield ('cycle_coverage', _semverish_from_date(_val))
+ elif _key != 'timestamp':
+ # skip timestamp; on daily/monthly reports it's just a copy of report_date/report_yearmonth
+ yield (_key, _val)
+
+ return dict(_each_kwarg())
+
+
+def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageEvent:
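+ '''map an es6 counted-usage document's _source onto es8 OsfCountedUsageEvent fields'''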
+ return es8_metrics.OsfCountedUsageEvent(
+ # fields from djelme.CountedUsageRecord:
+ timestamp=source['timestamp'],
+ sessionhour_id=source['session_id'],
+ platform_iri=source.get('platform_iri') or website_settings.DOMAIN,
+ database_iri=_convert_database_iri(
+ provider_id=source.get('provider_id'),
+ osf_model_name=source.get('item_type'),
+ ),
+ within_iris=[
+ osf_iri(_within_osfid)
+ for _within_osfid in source.get('surrounding_guids', ())
+ ],
+ # fields from OsfCountedUsageEvent:
+ item_osfid=source['item_guid'],
+ item_type=_convert_item_type(
+ source.get('item_type'),
+ has_surrounding_items=bool(source.get('surrounding_guids')),
+ ),
+ item_public=source.get('item_public', True),
+ provider_id=source.get('provider_id', 'osf'),
+ user_is_authenticated=source.get('user_is_authenticated', False),
+ action_labels=source.get('action_labels'),
+ pageview_info=source.get('pageview_info'),
+ )
+
+
+def _convert_preprint_metric(
+ hit: dict, action_labels: list[str]
+) -> es8_metrics.OsfCountedUsageEvent:
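+ '''convert one es6 preprint view/download hit into an es8 counted-usage event (unsaved)'''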
+ _source = hit['_source']
+ _doc_id = hit['_id']
+ return es8_metrics.OsfCountedUsageEvent.record(
+ using=False, # don't save yet; will save in bulk
+ # fields used to compute a sessionhour_id:
+ timestamp=datetime.datetime.fromisoformat(_source['timestamp']),
+ user_id=_source.get('user_id'),
+ client_session_id=_doc_id, # unique session per event (best we can do)
+ # fields from djelme.CountedUsageRecord:
+ platform_iri=website_settings.DOMAIN,
+ database_iri=_convert_database_iri(
+ provider_id=_source.get('provider_id'),
+ osf_model_name='preprint',
+ ),
+ # fields from OsfCountedUsageEvent:
+ item_osfid=_source['preprint_id'],
+ item_type=OSF.Preprint,
+ item_public=True,
+ provider_id=_source.get('provider_id'),
+ user_is_authenticated=bool(_source.get('user_id')),
+ action_labels=action_labels,
+ )
+
+
+def _convert_public_usage_report(
+ source: dict,
+ prior_report: es8_metrics.MonthlyPublicItemUsageReportEs8 | None,
+ item_is_component: bool,
+) -> es8_metrics.MonthlyPublicItemUsageReportEs8:
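+ '''build one item-month's es8 usage report
+
+ cumulative counts continue from `prior_report` when given, else are
+ recomputed from raw es6 events up through the end of the report's month
+ '''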
+ if prior_report is None:
+ _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage(
+ osfid=source['item_osfid'],
+ until_when=YearMonth.from_str(source['report_yearmonth']).month_end(),
+ is_preprint=(source.get('item_type') == 'preprint'),
+ )
+ else:
+ _c_views = prior_report.cumulative_view_count + source.get('view_count', 0)
+ _c_view_sess = prior_report.cumulative_view_session_count + source.get(
+ 'view_session_count', 0
+ )
+ _c_downloads = prior_report.cumulative_download_count + source.get(
+ 'download_count', 0
+ )
+ _c_download_sess = prior_report.cumulative_download_session_count + source.get(
+ 'download_session_count', 0
+ )
+ return es8_metrics.MonthlyPublicItemUsageReportEs8(
+ cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']),
+ item_osfid=source['item_osfid'],
+ item_type=_convert_item_type(
+ source.get('item_type'),
+ has_surrounding_items=item_is_component,
+ ),
+ provider_id=source.get('provider_id'),
+ platform_iri=source.get('platform_iri') or website_settings.DOMAIN,
+ view_count=source.get('view_count', 0),
+ view_session_count=source.get('view_session_count', 0),
+ cumulative_view_count=_c_views,
+ cumulative_view_session_count=_c_view_sess,
+ download_count=source.get('download_count', 0),
+ download_session_count=source.get('download_session_count', 0),
+ cumulative_download_count=_c_downloads,
+ cumulative_download_session_count=_c_download_sess,
+ )
+
+
+def _get_cumulative_usage(osfid: str, until_when, *, is_preprint: bool):
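+ '''return (views, view_sessions, downloads, download_sessions) from before `until_when`'''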
+ if is_preprint:
+ _views = _cumulative_preprint_count(PreprintView, osfid, until_when)
+ _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when)
+ _view_sess, _download_sess = 0, 0 # no session info on preprints (yet)
+ else:
+ _views, _view_sess = _cumulative_countedusage_views(osfid, until_when)
+ _downloads, _download_sess = _cumulative_countedusage_downloads(
+ osfid, until_when
+ )
+ return (_views, _view_sess, _downloads, _download_sess)
+
+
+def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]:
+ '''compute view_session_count separately to avoid double-counting
+
+ (the same session may be represented in both the composite agg on `item_guid`
+ and that on `surrounding_guids`)
+ '''
+ # copied/adapted from osf.metrics.reporters.public_item_usage
+ _search = (
+ CountedUsageEs6.search()
+ .filter('term', item_public=True)
+ .filter('range', timestamp={'lt': until_when})
+ .filter('term', action_labels='view')
+ .filter(
+ 'bool',
+ should=[
+ {'term': {'item_guid': osfid}},
+ {'term': {'surrounding_guids': osfid}},
+ ],
+ minimum_should_match=1,
+ )
+ .extra(size=0) # only aggregations, no hits
+ )
+ _search.aggs.metric(
+ 'agg_session_count',
+ 'cardinality',
+ field='session_id',
+ precision_threshold=_MAX_CARDINALITY_PRECISION,
+ )
+ _response = _search.execute()
+ _view_count = _response.hits.total
+ _view_session_count = (
+ _response.aggregations.agg_session_count.value
+ if 'agg_session_count' in _response.aggregations
+ else 0
+ )
+ return (_view_count, _view_session_count)
+
+
+def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]:
+ '''aggregate downloads on each osfid (not including components/files)'''
+ # copied/adapted from osf.metrics.reporters.public_item_usage
+ _search = (
+ CountedUsageEs6.search()
+ .filter('term', item_public=True)
+ .filter('range', timestamp={'lt': until_when})
+ .filter('term', action_labels='download')
+ .filter('term', item_guid=osfid)
+ )
+ _search.aggs.metric(
+ 'agg_session_count',
+ 'cardinality',
+ field='session_id',
+ precision_threshold=_MAX_CARDINALITY_PRECISION,
+ )
+ _response = _search.execute()
+ _download_count = _response.hits.total
+ _download_session_count = (
+ _response.aggregations.agg_session_count.value
+ if 'agg_session_count' in _response.aggregations
+ else 0
+ )
+ return (_download_count, _download_session_count)
+
+
+def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int:
+ '''aggregate views on each preprint'''
+ # copied/adapted from osf.metrics.preprint_metrics
+ _search = (
+ preprint_metric_cls.search()
+ .filter('term', preprint_id=osfid)
+ .filter('range', timestamp={'lt': until_when})
+ .extra(size=0) # no hits; only aggs
+ )
+ _search.aggs.metric('agg_count', 'sum', field='count')
+ _response = _search.execute()
+ _view_count = (
+ int(_response.aggregations.agg_count.value)
+ if hasattr(_response.aggregations, 'agg_count')
+ else 0
+ )
+ return _view_count
+
+
+def _convert_item_type(osf_model_name: str | list[str] | None, has_surrounding_items: bool):
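+ '''resolve an osf model name (as stored on es6 records) to an osfmap type, falling back to OSF.Object'''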
+ if isinstance(osf_model_name, list):
+ osf_model_name = osf_model_name[0] if osf_model_name else None
+ if osf_model_name:
+ try:
+ return osfmap_type_from_model(
+ apps.get_model('osf', osf_model_name),
+ is_component=has_surrounding_items,
+ )
+ except LookupError:
+ pass
+ return OSF.Object # fine; fall back to the abstract type
+
+
+def _convert_database_iri(provider_id: str | None, osf_model_name: str) -> str:
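+ '''best-effort iri for an item's provider, mirroring each provider's get_semantic_iri where possible'''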
+ if not provider_id:
+ return website_settings.DOMAIN # osf is a provider, sure why not
+
+ match osf_model_name: # lower-cased osf.models class names
+ case 'node' | 'osfuser': # implicit untyped 'osf' provider
+ return website_settings.DOMAIN
+ case 'preprint': # match PreprintProvider.get_semantic_iri
+ return f'{website_settings.DOMAIN}preprints/{provider_id}'
+ case 'registration': # match RegistrationProvider.get_semantic_iri
+ return f'{website_settings.DOMAIN}registries/{provider_id}'
+ case _ if 'file' in osf_model_name:
+ # file providers are a different thing and don't really have an iri, just an id
+ return f'urn:files.osf.io:{provider_id}'
+ case _: # give up gracefully
+ _logger.error(
+ f'unknown model {osf_model_name!r} with provider {provider_id!r}'
+ )
+ return f'urn:osf.io:{provider_id}'
+
+
+def _each_usage_report_osfid(until_when, after_osfid=None):
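+ '''yield each distinct item_osfid with es6 usage reports before `until_when` (paged composite agg)'''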
+ _search = (
+ es6_reports.PublicItemUsageReport.search()
+ .filter('range', timestamp={'lt': until_when})
+ .extra(size=0)
+ )
+ _search.aggs.bucket(
+ 'agg_osfid',
+ 'composite',
+ sources=[{'osfid': {'terms': {'field': 'item_osfid'}}}],
+ size=500,
+ )
+ return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
+
+
+###
+# the command itself
+
+
+class Command(BaseCommand):
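+ '''manage the osf.metrics elastic 6->8 data migration
+
+ by default, print side-by-side es6/es8 record counts;
+ pass `--start` to record a start time and schedule celery migration tasks
+ '''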
+ def add_arguments(self, parser):
+ parser.add_argument(
+ '--no-setup',
+ action='store_true',
+ help='skip running `djelme_backend_setup` first',
+ )
+ parser.add_argument(
+ '--no-counts',
+ action='store_true',
+ help='skip printing es6/es8 record counts',
+ )
+ parser.add_argument(
+ '--clear-state',
+ action='store_true',
+ help='delete stored migration state (start time, etc)',
+ )
+ parser.add_argument(
+ '--clear-es8-data',
+ action='store_true',
+ help='delete already-migrated es8 records',
+ )
+ parser.add_argument(
+ '--start',
+ action='store_true',
+ help='record a start time (if not already started) and schedule migration tasks',
+ )
+ parser.add_argument(
+ '--unchanged',
+ action='store_true',
+ help='limit to recordtypes migrating unchanged',
+ )
+ parser.add_argument(
+ '--usage-events',
+ action='store_true',
+ help='limit to counted-usage events',
+ )
+ parser.add_argument(
+ '--usage-reports',
+ action='store_true',
+ help='limit to public-item-usage reports',
+ )
+
+ @functools.cached_property
+ def _migration_started_at(self):
+ return es8_metrics.Elastic6To8State.get_started_at()
+
+ def handle(
+ self,
+ *,
+ no_setup,
+ no_counts,
+ clear_state,
+ clear_es8_data,
+ start,
+ unchanged,
+ usage_events,
+ usage_reports,
+ **kwargs,
+ ):
+ self._quiet_chatty_loggers()
+ if not no_setup:
+ call_command('djelme_backend_setup')
+ if clear_state:
+ self._clear_state()
+ if clear_es8_data:
+ self._clear_es8_data(unchanged, usage_events, usage_reports)
+ self._check_started_at(start_now=start)
+ _default_all = not any((unchanged, usage_events, usage_reports))
+ if unchanged or _default_all:
+ self._handle_unchanged(start=start, no_counts=no_counts)
+ if usage_events or _default_all:
+ self._handle_usage_events(start=start, no_counts=no_counts)
+ if usage_reports or _default_all:
+ self._handle_usage_reports(start=start, no_counts=no_counts)
+ if not no_counts:
+ self.stdout.write('(counts may be approximate)')
+
+ def _handle_unchanged(self, *, start: bool, no_counts: bool):
+ # for each (unchanged) report/event:
+ for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items():
+ _assert_field_unchangedness(_es6_cls, _es8_cls)
+ if not no_counts:
+ # display counts
+ _es6_count = _es6_cls.search().count()
+ _es8_count = _es8_cls.search().count()
+ self._write_tabbed('es6', _es6_cls, _es6_count)
+ self._write_tabbed(
+ 'es8',
+ _es8_cls,
+ _es8_count,
+ style=self._eq_style(_es8_count, _es6_count),
+ )
+ if start: # schedule task
+ self.stdout.write(
+ f'starting {_es6_cls.__name__} => {_es8_cls.__name__}'
+ )
+ migrate_unchanged_recordtype.delay(
+ _es6_cls.__name__, self._migration_started_at.isoformat()
+ )
+
+ def _handle_usage_events(self, *, start: bool, no_counts: bool):
+ # for counted-usage events:
+ _started = self._migration_started_at or datetime.datetime.now()
+ _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date()
+ _range_end = _started.date() + datetime.timedelta(days=1)
+ if not no_counts:
+ # display counts for each view/download event type
+ _range_q = {
+ 'range': {
+ 'timestamp': {
+ 'gte': _range_start.isoformat(),
+ 'lt': _range_end.isoformat(),
+ }
+ }
+ }
+ _es6_usage_count_q = {
+ 'bool': {
+ 'filter': [_range_q, {'exists': {'field': 'item_guid'}}],
+ },
+ }
+ _es6_pview_count = PreprintView.search().filter(_range_q).count()
+ _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count()
+ _es6_usage_event_count = CountedUsageEs6.search().filter(_es6_usage_count_q).count()
+ _es6_count = (
+ _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count
+ )
+ _es8_count = es8_metrics.OsfCountedUsageEvent.search().filter(_range_q).count()
+ self._write_tabbed('es6', PreprintView, _es6_pview_count)
+ self._write_tabbed('es6', PreprintDownload, _es6_pdownload_count)
+ self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count)
+ self._write_tabbed(
+ 'es6', f'(total between {_range_start} and {_range_end})', _es6_count
+ )
+ self._write_tabbed(
+ 'es8',
+ es8_metrics.OsfCountedUsageEvent,
+ _es8_count,
+ style=self._eq_style(_es8_count, _es6_count),
+ )
+ if start: # schedule per-day tasks (three per day: usages, preprint views, preprint downloads)
+ self.stdout.write(
+ f'starting usages => {es8_metrics.OsfCountedUsageEvent.__name__}'
+ )
+ for _from_date, _until_date in _date_range(_range_start, _range_end):
+ _from_str = _from_date.isoformat()
+ _until_str = _until_date.isoformat()
+ migrate_counted_usages.delay(_from_str, _until_str)
+ migrate_preprint_views.delay(_from_str, _until_str)
+ migrate_preprint_downloads.delay(_from_str, _until_str)
+
+ def _handle_usage_reports(self, *, start: bool, no_counts: bool):
+ if not no_counts:
+ # display counts of reports and distinct items
+ _es6_count, _es6_item_count = _es6_usage_report_counts()
+ _es8_count, _es8_item_count = _es8_usage_report_counts()
+ self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count)
+ self._write_tabbed(
+ 'es8',
+ es8_metrics.MonthlyPublicItemUsageReportEs8,
+ _es8_count,
+ style=self._eq_style(_es8_count, _es6_count),
+ )
+ self._write_tabbed(
+ 'es6',
+ es6_reports.PublicItemUsageReport,
+ 'osfid count:',
+ _es6_item_count,
+ )
+ self._write_tabbed(
+ 'es8',
+ es8_metrics.MonthlyPublicItemUsageReportEs8,
+ 'osfid count:',
+ _es8_item_count,
+ style=self._eq_style(_es8_item_count, _es6_item_count),
+ )
+ # (if --start) schedule a task per item (via composite agg on es6 public usage reports);
+ # each item-task iterates through that item's reports oldest to newest, accumulating counts
+ if start:
+ self.stdout.write(
+ f'starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.MonthlyPublicItemUsageReportEs8.__name__}'
+ )
+ for _osfid in _each_usage_report_osfid(
+ until_when=self._migration_started_at
+ ):
+ migrate_usage_reports.delay(
+ _osfid, self._migration_started_at.isoformat()
+ )
+
+ def _check_started_at(self, start_now):
+ _started_at = self._migration_started_at
+ if _started_at:
+ self.stdout.write(
+ f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}'
+ )
+ elif start_now:
+ _started_at = es8_metrics.Elastic6To8State.set_started_at_now()
+ del self._migration_started_at # clear cache
+ self.stdout.write(
+ f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}'
+ )
+ else:
+ self.stdout.write(
+ 'osf.metrics 6->8 migration neither started nor starting (run with `--start` to start it)'
+ )
+
+ def _clear_state(self):
+ self.stdout.write(
+ 'clearing all migration state (start time, etc)', self.style.NOTICE
+ )
+ es8_metrics.Elastic6To8State.search().query({'match_all': {}}).delete()
+ es8_metrics.Elastic6To8State.refresh()
+
+ def _clear_es8_data(self, unchanged, usage_events, usage_reports):
+ _default_all = not any((unchanged, usage_events, usage_reports))
+ _to_clear = []
+ if _default_all or unchanged:
+ _to_clear.extend(_UNCHANGED_RECORDTYPES.values())
+ # match _handle_usage_events/_handle_usage_reports: events are OsfCountedUsageEvent,
+ # reports are MonthlyPublicItemUsageReportEs8
+ if _default_all or usage_events:
+ _to_clear.append(es8_metrics.OsfCountedUsageEvent)
+ if _default_all or usage_reports:
+ _to_clear.append(es8_metrics.MonthlyPublicItemUsageReportEs8)
+ for _es8_recordtype in _to_clear:
+ self.stdout.write(
+ f'clearing {_es8_recordtype.__name__}', self.style.NOTICE
+ )
+ _es8_recordtype.search().query({'match_all': {}}).delete()
+ _es8_recordtype.refresh()
+
+ def _eq_style(self, num: int, should_be: int):
+ return self.style.SUCCESS if (num == should_be) else self.style.WARNING
+
+ def _write_tabbed(self, *strables, style=None):
+ def _to_str(strable):
+ if isinstance(strable, type):
+ return strable.__name__
+ return str(strable)
+
+ self.stdout.write('\t'.join(map(_to_str, strables)), style)
+
+ def _quiet_chatty_loggers(self):
+ _chatty_loggers = [
+ 'elasticsearch',
+ 'elastic_transport',
+ 'elasticsearch_metrics',
+ ]
+ for logger_name in _chatty_loggers:
+ logging.getLogger(logger_name).setLevel(logging.ERROR)
diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py
index 83ed5f6d985..c5dd4034777 100644
--- a/osf/management/commands/monthly_reporters_go.py
+++ b/osf/management/commands/monthly_reporters_go.py
@@ -3,12 +3,14 @@
from django.core.management.base import BaseCommand
from django.db import OperationalError as DjangoOperationalError
-from elasticsearch.exceptions import ConnectionError as ElasticConnectionError
+from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError
+from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError
from psycopg2 import OperationalError as PostgresOperationalError
from framework.celery_tasks import app as celery_app
import framework.sentry
from osf.metrics.reporters import AllMonthlyReporters
+from osf.metrics.reports import MonthlyReport
from osf.metrics.utils import YearMonth
@@ -17,7 +19,8 @@
_CONTINUE_AFTER_ERRORS = (
DjangoOperationalError,
- ElasticConnectionError,
+ Elastic6ConnectionError,
+ Elastic8ConnectionError,
PostgresOperationalError,
)
@@ -70,11 +73,7 @@ def schedule_monthly_reporter(
@celery_app.task(
name='management.commands.monthly_reporter_do',
- autoretry_for=(
- DjangoOperationalError,
- ElasticConnectionError,
- PostgresOperationalError,
- ),
+ autoretry_for=_CONTINUE_AFTER_ERRORS,
max_retries=5,
retry_backoff=True,
)
@@ -85,9 +84,10 @@ def monthly_reporter_do(reporter_key: str, yearmonth: str, report_kwargs: dict):
framework.sentry.log_exception(exc)
return
- _report = _reporter.report(**report_kwargs)
- if _report is not None:
- _report.report_yearmonth = _reporter.yearmonth
+ _reports = _reporter.report(**report_kwargs)
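+ # reporters now return an iterable of reports (es8 + legacy)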
+ for _report in _reports:
+ if isinstance(_report, MonthlyReport) and (_report.report_yearmonth is None):
+ _report.report_yearmonth = _reporter.yearmonth
_report.save()
_followup_task = _reporter.followup_task(_report)
if _followup_task is not None:
diff --git a/osf/management/commands/osf_shell.py b/osf/management/commands/osf_shell.py
index 851895623ac..69443d004be 100644
--- a/osf/management/commands/osf_shell.py
+++ b/osf/management/commands/osf_shell.py
@@ -32,7 +32,7 @@ def get_user_imports():
from django.db.models import Model
from django_extensions.management.commands import shell_plus
from django_extensions.management.utils import signalcommand
-from elasticsearch_metrics.registry import registry as metrics_registry
+from elasticsearch_metrics.registry import djelme_registry
def header(text):
@@ -160,7 +160,7 @@ def get_osf_imports(self):
def get_metrics(self):
return {
each.__name__: each
- for each in metrics_registry.get_metrics()
+ for each in djelme_registry.each_recordtype()
}
def get_grouped_imports(self, options):
diff --git a/osf/management/commands/reindex_es6.py b/osf/management/commands/reindex_es6.py
index c37d0e34f2c..8961ea6fff1 100644
--- a/osf/management/commands/reindex_es6.py
+++ b/osf/management/commands/reindex_es6.py
@@ -4,7 +4,7 @@
import logging
from django.core.management.base import BaseCommand
-from elasticsearch_dsl import connections
+from elasticsearch6_dsl import connections
from elasticsearch_metrics.registry import registry
logger = logging.getLogger(__name__)
diff --git a/osf/management/commands/sync_databases.py b/osf/management/commands/sync_databases.py
index c31d63ea16e..b5030b4bba7 100644
--- a/osf/management/commands/sync_databases.py
+++ b/osf/management/commands/sync_databases.py
@@ -20,7 +20,7 @@ def handle(self, *args, **options):
['migrate'],
]
if waffle.switch_is_active(features.ELASTICSEARCH_METRICS):
- COMMANDS.append(['sync_metrics'])
+ COMMANDS.append(['djelme_backend_setup'])
for check in COMMANDS:
call_command(*check)
diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py
index 240e5c96561..74c20131464 100644
--- a/osf/metadata/osf_gathering.py
+++ b/osf/metadata/osf_gathering.py
@@ -13,6 +13,12 @@
from osf import models as osfdb
from osf.metadata import gather
from osf.metadata.definitions.datacite import DATACITE_RESOURCE_TYPES_GENERAL
+from osf.metadata.osfmap_utils import (
+ osfmap_type,
+ osf_iri,
+ is_osf_component,
+ osfid_from_iri,
+)
from osf.metadata.rdfutils import (
DATACITE,
DCAT,
@@ -30,7 +36,6 @@
SKOS,
checksum_iri,
format_dcterms_extent,
- without_namespace,
smells_like_iri,
)
from osf.metrics.reports import PublicItemUsageReport
@@ -319,15 +324,13 @@ def get_expiration_date(self, basket: gather.Basket) -> datetime.date | None:
##### END osfmap #####
-##### BEGIN osf-specific utils #####
-
class OsfFocus(gather.Focus):
def __init__(self, osf_item):
if isinstance(osf_item, str):
osf_item = osfdb.base.coerce_guid(osf_item).referent
super().__init__(
iri=osf_iri(osf_item),
- rdftype=get_rdf_type(osf_item),
+ rdftype=osfmap_type(osf_item),
provider_id=osf_item.provider._id if (osf_item and getattr(osf_item, 'type', '') == 'osf.registration' and osf_item.provider) else None
)
self.dbmodel = osf_item
@@ -337,54 +340,6 @@ def __init__(self, osf_item):
pass # is ok for a focus to be something non-osfguidy
-def is_root(osf_node):
- return (osf_node.root_id == osf_node.id)
-
-
-def get_rdf_type(osfguid_referent):
- if isinstance(osfguid_referent, osfdb.Guid):
- osfguid_referent = osfguid_referent.referent
-
- if isinstance(osfguid_referent, osfdb.OSFUser):
- return DCTERMS.Agent
- if isinstance(osfguid_referent, osfdb.BaseFileNode):
- return OSF.File
- if isinstance(osfguid_referent, osfdb.Preprint):
- return OSF.Preprint
- if isinstance(osfguid_referent, osfdb.Registration):
- return (
- OSF.Registration
- if is_root(osfguid_referent)
- else OSF.RegistrationComponent
- )
- if isinstance(osfguid_referent, osfdb.Node):
- return (
- OSF.Project
- if is_root(osfguid_referent)
- else OSF.ProjectComponent
- )
- raise NotImplementedError
-
-
-def osf_iri(guid_or_model):
- """return a rdflib.URIRef or None
-
- @param guid_or_model: a string, Guid instance, or another osf model instance
- @returns rdflib.URIRef or None
- """
- guid = osfdb.base.coerce_guid(guid_or_model)
- return OSFIO[guid._id]
-
-
-def osfguid_from_iri(iri: str) -> str:
- if iri.startswith(OSFIO):
- return without_namespace(iri, OSFIO)
- raise ValueError(f'expected iri starting with "{OSFIO}" (got "{iri}")')
-
-
-##### END osf-specific utils #####
-
-
##### BEGIN the gatherers #####
#
@@ -720,7 +675,7 @@ def gather_file_mediatype(focus):
@gather.er(DCTERMS.hasPart, DCTERMS.isPartOf)
def gather_parts(focus):
if isinstance(focus.dbmodel, osfdb.AbstractNode):
- if not is_root(focus.dbmodel) and focus.dbmodel.root.is_public:
+ if is_osf_component(focus.dbmodel) and focus.dbmodel.root.is_public:
root_focus = OsfFocus(focus.dbmodel.root)
yield (OSF.hasRoot, root_focus)
child_relations = (
@@ -1132,7 +1087,7 @@ def gather_cedar_templates(focus):
@gather.er(OSF.usage)
def gather_last_month_usage(focus):
_usage_report = PublicItemUsageReport.for_last_month(
- item_osfid=osfguid_from_iri(focus.iri),
+ item_osfid=osfid_from_iri(focus.iri),
)
if _usage_report is not None:
_usage_report_ref = rdflib.BNode()
diff --git a/osf/metadata/osfmap_utils.py b/osf/metadata/osfmap_utils.py
new file mode 100644
index 00000000000..031cd160eac
--- /dev/null
+++ b/osf/metadata/osfmap_utils.py
@@ -0,0 +1,69 @@
+from osf.metadata.rdfutils import (
+ DCTERMS,
+ OSF,
+ OSFIO,
+ without_namespace,
+)
+from osf import models as osfdb
+
+
+def is_osf_component(osf_node) -> bool:
+ return (
+ isinstance(osf_node, osfdb.AbstractNode)
+ and osf_node.root_id != osf_node.id
+ )
+
+
+def osfmap_type_from_model(model_cls, *, is_component=None):
+ if issubclass(model_cls, osfdb.OSFUser):
+ return DCTERMS.Agent
+ if issubclass(model_cls, osfdb.BaseFileNode):
+ return OSF.File
+ if issubclass(model_cls, osfdb.Preprint):
+ return OSF.Preprint
+ if issubclass(model_cls, osfdb.Registration):
+ if is_component is None:
+ raise ValueError(f'osfmap_type_from_model requires `is_component` for {model_cls}')
+ return (
+ OSF.RegistrationComponent
+ if is_component
+ else OSF.Registration
+ )
+ if issubclass(model_cls, osfdb.Node):
+ if is_component is None:
+ raise ValueError(f'osfmap_type_from_model requires `is_component` for {model_cls}')
+ return (
+ OSF.ProjectComponent
+ if is_component
+ else OSF.Project
+ )
+ raise LookupError(model_cls)
+
+
+def osfmap_type(osf_obj):
+ if isinstance(osf_obj, osfdb.Guid):
+ osf_obj = osf_obj.referent
+ return osfmap_type_from_model(type(osf_obj), is_component=is_osf_component(osf_obj))
+
+
+def osf_iri(guid_or_model):
+ """return a rdflib.URIRef or None
+
+ @param guid_or_model: a string, Guid instance, or another osf model instance
+ @returns rdflib.URIRef or None
+ """
+ _osfid: str = (
+ guid_or_model
+ if isinstance(guid_or_model, str)
+ else osfdb.base.coerce_guid(guid_or_model)._id
+ )
+ return OSFIO[_osfid]
+
+
+def osfid_from_iri(iri: str) -> str:
+ if not iri.startswith(OSFIO):
+ raise ValueError(f'expected iri starting with "{OSFIO}" (got {iri!r})')
+ _osfid = without_namespace(iri, OSFIO)
+ if not _osfid or '/' in _osfid:
+ raise ValueError(f'expected iri path with exactly one segment (got {_osfid!r} from {iri!r})')
+ return _osfid
diff --git a/osf/metadata/serializers/linkset.py b/osf/metadata/serializers/linkset.py
index f83dad00ebd..3ee907d0532 100644
--- a/osf/metadata/serializers/linkset.py
+++ b/osf/metadata/serializers/linkset.py
@@ -16,7 +16,7 @@
import rdflib
from ._base import MetadataSerializer
-from osf.metadata.osf_gathering import osfguid_from_iri
+from osf.metadata.osf_gathering import osfid_from_iri
from osf.metadata.rdfutils import (DOI, DATACITE, DCTERMS, OWL, RDF, OSF, DCAT, SCHEMA, DATACITE_SCHEMA_RESOURCE_TYPE_GENERAL_MAPPING, map_resource_type_general_datacite_to_scheme)
from website.settings import DOMAIN
from website.util import web_url_for
@@ -74,7 +74,7 @@ def _each_link(self) -> Iterator[SignpostLink]:
base_metadata_url = urljoin(DOMAIN, web_url_for(
'metadata_download', # name of a view function mapped in website/routes.py
- guid=osfguid_from_iri(self.basket.focus.iri),
+ guid=osfid_from_iri(self.basket.focus.iri),
))
split_base_metadata_url = urlsplit(base_metadata_url)
diff --git a/osf/metrics/__init__.py b/osf/metrics/__init__.py
index 0e7b1a1cf32..6056e6d92f3 100644
--- a/osf/metrics/__init__.py
+++ b/osf/metrics/__init__.py
@@ -17,6 +17,8 @@
StorageAddonUsage,
UserSummaryReport,
)
+from . import es8_metrics
+
DAILY_REPORTS = (
DownloadCountReport,
@@ -36,4 +38,5 @@
'PreprintView',
'PreprintDownload',
'RegistriesModerationMetrics',
+ 'es8_metrics',
)
diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py
index 39b3b74129b..41ea012fda5 100644
--- a/osf/metrics/counted_usage.py
+++ b/osf/metrics/counted_usage.py
@@ -4,7 +4,7 @@
from urllib.parse import urlsplit
from elasticsearch6_dsl import InnerDoc, analyzer, tokenizer
-from elasticsearch_metrics import metrics
+import elasticsearch_metrics.imps.elastic6 as metrics
from elasticsearch_metrics.signals import pre_save
from django.dispatch import receiver
import pytz
diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py
new file mode 100644
index 00000000000..1d73009ed5b
--- /dev/null
+++ b/osf/metrics/es8_metrics.py
@@ -0,0 +1,579 @@
+import datetime
+import enum
+import functools
+from urllib.parse import urlsplit
+
+import elasticsearch8.dsl as esdsl
+from elasticsearch_metrics import DAILY, MONTHLY, YEARLY
+import elasticsearch_metrics.imps.elastic8 as djelme
+
+from osf.metadata.osfmap_utils import (
+ osfmap_type,
+ osf_iri,
+ osfid_from_iri,
+)
+from osf.metrics.counted_usage import _get_surrounding_guids
+from osf.metrics.utils import YearMonth
+from osf import models as osfdb
+from website import settings as website_settings
+
+
+###
+# custom dsl fields
+
+class YearmonthField(esdsl.Date):
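+ '''date field that round-trips YearMonth values as strict "YYYY-MM" strings'''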
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs, format='strict_year_month')
+
+ def deserialize(self, data):
+ if isinstance(data, int):
+ # elasticsearch stores dates as milliseconds since the unix epoch (utc)
+ _as_datetime = datetime.datetime.fromtimestamp(data / 1000, tz=datetime.UTC)
+ return YearMonth.from_date(_as_datetime)
+ elif data is None:
+ return None
+ try:
+ return YearMonth.from_any(data)
+ except ValueError:
+ raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth')
+
+ def serialize(self, data, skip_empty=True):
+ if isinstance(data, str):
+ return data
+ elif isinstance(data, YearMonth):
+ return str(data)
+ elif isinstance(data, (datetime.datetime, datetime.date)):
+ return str(YearMonth.from_date(data))
+ elif data is None:
+ return None
+ else:
+ raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM')
+
+
+###
+# inner objects for events
+
+route_prefix_analyzer = esdsl.analyzer(
+ 'route_prefix_analyzer',
+ tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'),
+)
+
+
+class PageviewInfo(esdsl.InnerDoc):
+ """PageviewInfo
+
+ for CountedAuthUsage generated by viewing a web page
+ """
+
+ # fields that should be provided
+ referer_url: str | None
+ page_url: str | None
+ page_title: str | None
+ route_name: str | None = esdsl.mapped_field(esdsl.Keyword(
+ fields={
+ 'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer),
+ },
+ ))
+
+ # fields auto-filled
+ page_path: str | None
+ referer_domain: str | None
+ hour_of_day: int | None
+
+
+###
+# Event records
+
+class OsfCountedUsageEvent(djelme.CountedUsageRecord):
+ '''
+ Aim to support a COUNTER-style reporting api
+ https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+ https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
+ '''
+ UNIQUE_TOGETHER_FIELDS = (
+ 'platform_iri',
+ 'sessionhour_id',
+ 'action_labels',
+ # include some non-field properties for more complex logic to
+ # slightly better approximate `counter:Double-Click Filtering`
+ # and allow for multiple pages describing the same item_iri
+ '_page_url_or_osfid', # non-field property
+ '_timestamp_date', # non-field property
+ '_timestamp_30sec_window', # non-field property
+ )
+
+ # inherited fields:
+ # timestamp: datetime.datetime
+ # platform_iri: str
+ # database_iri: str
+ # item_iri: str
+ # sessionhour_id: str
+ # within_iris: list[str]
+
+ # osf-specific fields:
+ item_osfid: str
+ item_type: str
+ item_public: bool
+ provider_id: str | None
+ user_is_authenticated: bool
+ action_labels: list[str]
+ pageview_info: PageviewInfo | None
+
+ class Meta:
+ timeseries_index_timedepth = MONTHLY
+
+ class ActionLabel(enum.Enum):
+ SEARCH = 'search' # counter:Search
+ VIEW = 'view' # counter:Investigation
+ DOWNLOAD = 'download' # counter:Request
+ WEB = 'web' # counter:Regular (aka "pageview")
+ API = 'api' # counter:TDM (aka "non-web api usage")
+
+ @classmethod
+ def record(cls, **kwargs):
+ # autofill `user_is_authenticated` before `user_id` is discarded (too late to do in `clean`)
+ if 'user_is_authenticated' not in kwargs:
+ kwargs['user_is_authenticated'] = bool(kwargs.get('user_id'))
+ return super().record(**kwargs)
+
+ @property
+ def _page_url_or_osfid(self):
+ # for UNIQUE_TOGETHER_FIELDS
+ return (
+ self.pageview_info.page_url
+ if self.pageview_info is not None and self.pageview_info.page_url is not None
+ else self.item_osfid
+ )
+
+ @property
+ def _timestamp_date(self):
+ # for UNIQUE_TOGETHER_FIELDS
+ return self.timestamp.date()
+
+ @property
+ def _timestamp_30sec_window(self):
+ # for UNIQUE_TOGETHER_FIELDS
+ # slice the day into an array of 30-second windows,
+ # find this timestamp's windowslice index
+ _day_start = datetime.datetime(
+ self.timestamp.year,
+ self.timestamp.month,
+ self.timestamp.day,
+ tzinfo=self.timestamp.tzinfo,
+ )
+ _time_in_seconds = (self.timestamp - _day_start).total_seconds()
+ return int(_time_in_seconds / 30) # 30-second windows
+
+ @functools.cached_property
+ def _osfid_referent(self):
+ # for use by autofill methods, if needed
+ _osfguid = osfdb.Guid.load(self.item_osfid)
+ return _osfguid.referent if _osfguid else None
+
+ def clean(self):
+ self._autofill_platform_iri()
+ self._autofill_item_iri_and_osfid()
+ self._autofill_item_public()
+ self._autofill_item_type()
+ self._autofill_provider_id()
+ self._autofill_within_iris()
+ self._autofill_pageview()
+ self._autofill_database_iri()
+ self._clean_action_labels()
+ super().clean()
+
+ def _autofill_platform_iri(self):
+ if self.platform_iri is None:
+ self.platform_iri = website_settings.DOMAIN
+
+ def _autofill_item_iri_and_osfid(self):
+ if self.item_osfid and not self.item_iri:
+ self.item_iri = osf_iri(self.item_osfid)
+ elif self.item_iri and not self.item_osfid:
+ try:
+ self.item_osfid = osfid_from_iri(self.item_iri)
+ except ValueError:
+ pass
+
+ def _autofill_item_public(self):
+ if self.item_osfid and (self.item_public is None):
+ _item = self._osfid_referent
+ # if it quacks like BaseFileNode, look at .target instead
+ _item = getattr(_item, 'target', None) or _item
+ self.item_public = (
+ _item.verified_publishable # quacks like Preprint
+ if hasattr(_item, 'verified_publishable')
+ else getattr(_item, 'is_public', False) # quacks like AbstractNode
+ )
+
+ def _autofill_item_type(self):
+ if self.item_osfid and not self.item_type:
+ self.item_type = osfmap_type(self._osfid_referent)
+
+ def _autofill_provider_id(self):
+ if self.item_osfid and not self.provider_id:
+ _provider = getattr(self._osfid_referent, 'provider', None)
+ if _provider is None:
+ self.provider_id = 'osf' # quacks like Node, Comment, WikiPage
+ elif isinstance(_provider, str):
+ self.provider_id = _provider # quacks like BaseFileNode
+ else:
+ self.provider_id = _provider._id # quacks like Registration, Preprint, Collection
+
+ def _autofill_within_iris(self):
+ if self.item_osfid and (self.within_iris is None) and self._osfid_referent:
+ self.within_iris = [
+ osf_iri(_osfid)
+ for _osfid in _get_surrounding_guids(self._osfid_referent)
+ ]
+ # ensure inclusive "within"
+ if not self.within_iris:
+ self.within_iris = [self.item_iri]
+ if self.item_iri not in self.within_iris:
+ self.within_iris = [self.item_iri, *self.within_iris]
+
+ def _autofill_pageview(self):
+ # autofill pageview_info fields from other fields
+ if self.pageview_info:
+ self.pageview_info.hour_of_day = self.timestamp.hour
+ _url = self.pageview_info.page_url
+ if _url:
+ self.pageview_info.page_path = urlsplit(_url).path.rstrip('/')
+ _ref_url = self.pageview_info.referer_url
+ if _ref_url:
+ self.pageview_info.referer_domain = urlsplit(_ref_url).netloc
+
+ def _autofill_database_iri(self):
+ if self.item_osfid and not self.database_iri:
+ _provider = getattr(self._osfid_referent, 'provider', None)
+ if not _provider:
+ self.database_iri = website_settings.DOMAIN
+ elif isinstance(_provider, str):
+ # file providers are a different thing and don't really have an iri, just an id
+ self.database_iri = f'urn:files.osf.io:{self.provider_id}'
+ else:
+ self.database_iri = _provider.get_semantic_iri()
+
+ def _clean_action_labels(self):
+ if self.action_labels:
+ self.action_labels = sorted(self.action_labels)
+
+
+class RegistriesModerationEventEs8(djelme.EventRecord):
+ UNIQUE_TOGETHER_FIELDS = (
+ 'timestamp', 'registration_id', 'trigger', 'from_state', 'to_state', 'user_id'
+ )
+
+ registration_id: str
+ provider_id: str
+ trigger: str
+ from_state: str
+ to_state: str
+ user_id: str
+ comment: str | None
+
+ class Meta:
+ timeseries_recordtype_name = 'RegistriesModerationEvent'
+ timeseries_index_timedepth = MONTHLY
+
+
+###
+# Reusable inner objects for reports
+
+class RunningTotal(esdsl.InnerDoc):
+ total: int
+ total_daily: int | None
+
+
+class FileRunningTotals(esdsl.InnerDoc):
+ total: int
+ public: int
+ private: int
+ total_daily: int
+ public_daily: int
+ private_daily: int
+
+
+class NodeRunningTotals(esdsl.InnerDoc):
+ total: int
+ total_excluding_spam: int | None
+ public: int
+ private: int
+ total_daily: int
+ total_daily_excluding_spam: int | None
+ public_daily: int
+ private_daily: int
+
+
+class RegistrationRunningTotals(esdsl.InnerDoc):
+ total: int
+ public: int
+ embargoed: int
+ embargoed_v2: int
+ withdrawn: int | None
+ total_daily: int
+ public_daily: int
+ embargoed_daily: int
+ embargoed_v2_daily: int
+ withdrawn_daily: int | None
+
+
+class UsageByStorageAddon(esdsl.InnerDoc):
+ addon_shortname: str
+ enabled_usersettings: RunningTotal
+ linked_usersettings: RunningTotal
+ deleted_usersettings: RunningTotal
+ usersetting_links: RunningTotal
+ connected_nodesettings: RunningTotal
+ disconnected_nodesettings: RunningTotal
+ deleted_nodesettings: RunningTotal
+
+
+###
+# Cyclic reports
+
+
+class DailyStorageAddonUsageReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+
+ usage_by_addon: list[UsageByStorageAddon]
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyStorageAddonUsageReport'
+
+
+class DailyDownloadCountReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+
+ daily_file_downloads: int
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyDownloadCountReport'
+
+
+class DailyInstitutionSummaryReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',)
+
+ institution_id: str
+ institution_name: str
+ users: RunningTotal
+ nodes: NodeRunningTotals
+ projects: NodeRunningTotals
+ registered_nodes: RegistrationRunningTotals
+ registered_projects: RegistrationRunningTotals
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyInstitutionSummaryReport'
+
+
+class DailyNewUserDomainReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',)
+
+ domain_name: str
+ new_user_count: int
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyNewUserDomainReport'
+
+
+class DailyNodeSummaryReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+
+ nodes: NodeRunningTotals
+ projects: NodeRunningTotals
+ registered_nodes: RegistrationRunningTotals
+ registered_projects: RegistrationRunningTotals
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyNodeSummaryReport'
+
+
+class DailyOsfstorageFileCountReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+
+ files: FileRunningTotals
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyOsfstorageFileCountReport'
+
+
+class DailyPreprintSummaryReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',)
+ provider_key: str
+ preprint_count: int
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyPreprintSummaryReport'
+
+
+class DailyUserSummaryReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = DAILY
+
+ active: int
+ deactivated: int
+ merged: int
+ new_users_daily: int
+ new_users_with_institution_daily: int
+ unconfirmed: int
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'DailyUserSummaryReport'
+
+
+class MonthlySpamSummaryReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = MONTHLY
+
+ node_confirmed_spam: int
+ node_confirmed_ham: int
+ node_flagged: int
+ registration_confirmed_spam: int
+ registration_confirmed_ham: int
+ registration_flagged: int
+ preprint_confirmed_spam: int
+ preprint_confirmed_ham: int
+ preprint_flagged: int
+ user_marked_as_spam: int
+ user_marked_as_ham: int
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'MonthlySpamSummaryReport'
+
+
+class MonthlyInstitutionalUserReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = MONTHLY
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',)
+
+ institution_id: str
+ # user info:
+ user_id: str
+ user_name: str
+ department_name: str | None
+ month_last_login = YearmonthField()
+ month_last_active = YearmonthField()
+ account_creation_date = YearmonthField()
+ orcid_id: str | None
+ # counts:
+ public_project_count: int
+ private_project_count: int
+ public_registration_count: int
+ embargoed_registration_count: int
+ published_preprint_count: int
+ public_file_count: int = esdsl.mapped_field(esdsl.Long())
+ storage_byte_count: int = esdsl.mapped_field(esdsl.Long())
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'MonthlyInstitutionalUserReport'
+
+
+class MonthlyInstitutionSummaryReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = MONTHLY
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', )
+
+ institution_id: str
+ user_count: int
+ public_project_count: int
+ private_project_count: int
+ public_registration_count: int
+ embargoed_registration_count: int
+ published_preprint_count: int
+ storage_byte_count: int = esdsl.mapped_field(esdsl.Long())
+ public_file_count: int = esdsl.mapped_field(esdsl.Long())
+ monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long())
+ monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long())
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'MonthlyInstitutionSummaryReport'
+
+
+class MonthlyPublicItemUsageReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = MONTHLY
+ UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid')
+
+ # where noted, fields are meant to correspond to defined terms from COUNTER
+ # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html
+ # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html
+ item_osfid: str
+ item_type: list[str] # counter:Data-Type
+ provider_id: list[str] # counter:Database(?)
+ platform_iri: list[str] # counter:Platform
+
+ # view counts include views on components or files contained by this item
+ view_count: int = esdsl.mapped_field(esdsl.Long())
+ view_session_count: int = esdsl.mapped_field(esdsl.Long())
+ cumulative_view_count: int = esdsl.mapped_field(esdsl.Long())
+ cumulative_view_session_count: int = esdsl.mapped_field(esdsl.Long())
+
+ # download counts of this item only (not including contained components or files)
+ download_count: int = esdsl.mapped_field(esdsl.Long())
+ download_session_count: int = esdsl.mapped_field(esdsl.Long())
+ cumulative_download_count: int = esdsl.mapped_field(esdsl.Long())
+ cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long())
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'MonthlyPublicItemUsageReport'
+
+
+class MonthlyPrivateSpamMetricsReportEs8(djelme.CyclicRecord):
+ CYCLE_TIMEDEPTH = MONTHLY
+
+ node_oopspam_flagged: int
+ node_oopspam_hammed: int
+ node_akismet_flagged: int
+ node_akismet_hammed: int
+ preprint_oopspam_flagged: int
+ preprint_oopspam_hammed: int
+ preprint_akismet_flagged: int
+ preprint_akismet_hammed: int
+
+ class Meta:
+ timeseries_index_timedepth = YEARLY
+ timeseries_recordtype_name = 'MonthlyPrivateSpamMetricsReport'
+
+
+###
+# data migration state
+
+class Elastic6To8State(djelme.SimpleRecord):
+ """index for storing values helpful for keeping track of the elastic 6->8 data migration"""
+ UNIQUE_TOGETHER_FIELDS = ('key',)
+ key: str
+ value: str | None
+ timestamp: datetime.datetime = esdsl.mapped_field(
+ default_factory=lambda: datetime.datetime.now(datetime.UTC),
+ )
+
+ @classmethod
+ def get_by_key(cls, key: str):
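+ # UNIQUE_TOGETHER_FIELDS is ('key',), so expect at most one record per key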
+ _response = cls.search().query({'term': {'key': key}})[0].execute()
+ return _response[0] if _response else None
+
+ @classmethod
+ def get_timestamp(cls, key: str) -> datetime.datetime | None:
+ _record = cls.get_by_key(key)
+ return _record.timestamp if _record else None
+
+ @classmethod
+ def get_started_at(cls):
+ return cls.get_timestamp('started_at')
+
+ @classmethod
+ def set_started_at_now(cls):
+ _record = cls.record(key='started_at')
+ cls.refresh()
+ return _record.timestamp
diff --git a/osf/metrics/metric_mixin.py b/osf/metrics/metric_mixin.py
index 724ab1958da..df87d5123b1 100644
--- a/osf/metrics/metric_mixin.py
+++ b/osf/metrics/metric_mixin.py
@@ -2,7 +2,7 @@
from django.db import models
from django.utils import timezone
-from elasticsearch.exceptions import NotFoundError
+from elasticsearch6.exceptions import NotFoundError
import pytz
diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py
index 9d02ec191a2..d284d80827e 100644
--- a/osf/metrics/preprint_metrics.py
+++ b/osf/metrics/preprint_metrics.py
@@ -1,5 +1,5 @@
-from elasticsearch.exceptions import NotFoundError
-from elasticsearch_metrics import metrics
+from elasticsearch6.exceptions import NotFoundError
+import elasticsearch_metrics.imps.elastic6 as metrics
from .metric_mixin import MetricMixin
diff --git a/osf/metrics/registry_metrics.py b/osf/metrics/registry_metrics.py
index 475dca28673..9c779fe8c0b 100644
--- a/osf/metrics/registry_metrics.py
+++ b/osf/metrics/registry_metrics.py
@@ -1,4 +1,4 @@
-from elasticsearch_metrics import metrics
+import elasticsearch_metrics.imps.elastic6 as metrics
from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates
from .metric_mixin import MetricMixin
diff --git a/osf/metrics/reporters/download_count.py b/osf/metrics/reporters/download_count.py
index f772722dc31..4350c1440a1 100644
--- a/osf/metrics/reporters/download_count.py
+++ b/osf/metrics/reporters/download_count.py
@@ -1,14 +1,22 @@
from osf.models import PageCounter
from osf.metrics.reports import DownloadCountReport
+from osf.metrics.es8_metrics import DailyDownloadCountReportEs8
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
class DownloadCountReporter(DailyReporter):
def report(self, date):
download_count = int(PageCounter.get_all_downloads_on_date(date) or 0)
- return [
- DownloadCountReport(
- daily_file_downloads=download_count,
- report_date=date,
- ),
- ]
+ reports = []
+ report_es8 = DailyDownloadCountReportEs8(
+ cycle_coverage=cycle_coverage_date(date),
+ daily_file_downloads=download_count,
+ )
+ reports.append(report_es8)
+ report = DownloadCountReport(
+ daily_file_downloads=report_es8.daily_file_downloads,
+ report_date=date,
+ )
+ reports.append(report)
+ return reports
diff --git a/osf/metrics/reporters/institution_summary.py b/osf/metrics/reporters/institution_summary.py
index 892e337aec4..1148f2456e5 100644
--- a/osf/metrics/reporters/institution_summary.py
+++ b/osf/metrics/reporters/institution_summary.py
@@ -9,9 +9,15 @@
RegistrationRunningTotals,
)
from osf.models import Institution
+from osf.metrics.es8_metrics import (
+ DailyInstitutionSummaryReportEs8,
+ RunningTotal as RunningTotalEs8,
+ NodeRunningTotals as NodeRunningTotalsEs8,
+ RegistrationRunningTotals as RegistrationRunningTotalsEs8
+)
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
-
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -39,16 +45,15 @@ def report(self, date):
created__date__lte=date,
type='osf.registration',
)
-
- report = InstitutionSummaryReport(
- report_date=date,
+ report_es8 = DailyInstitutionSummaryReportEs8(
+ cycle_coverage=cycle_coverage_date(date),
institution_id=institution._id,
institution_name=institution.name,
- users=RunningTotal(
+ users=RunningTotalEs8(
total=institution.get_institution_users().filter(is_active=True).count(),
total_daily=institution.get_institution_users().filter(date_confirmed__date=date).count(),
),
- nodes=NodeRunningTotals(
+ nodes=NodeRunningTotalsEs8(
total=node_qs.count(),
public=node_qs.filter(public_query).count(),
private=node_qs.filter(private_query).count(),
@@ -58,7 +63,7 @@ def report(self, date):
private_daily=node_qs.filter(private_query & daily_query).count(),
),
# Projects use get_roots to remove children
- projects=NodeRunningTotals(
+ projects=NodeRunningTotalsEs8(
total=node_qs.get_roots().count(),
public=node_qs.filter(public_query).get_roots().count(),
private=node_qs.filter(private_query).get_roots().count(),
@@ -67,7 +72,7 @@ def report(self, date):
public_daily=node_qs.filter(public_query & daily_query).get_roots().count(),
private_daily=node_qs.filter(private_query & daily_query).get_roots().count(),
),
- registered_nodes=RegistrationRunningTotals(
+ registered_nodes=RegistrationRunningTotalsEs8(
total=registration_qs.count(),
public=registration_qs.filter(public_query).count(),
embargoed=registration_qs.filter(private_query).count(),
@@ -78,7 +83,7 @@ def report(self, date):
embargoed_daily=registration_qs.filter(private_query & daily_query).count(),
embargoed_v2_daily=registration_qs.filter(private_query & daily_query & embargo_v2_query).count(),
),
- registered_projects=RegistrationRunningTotals(
+ registered_projects=RegistrationRunningTotalsEs8(
total=registration_qs.get_roots().count(),
public=registration_qs.filter(public_query).get_roots().count(),
embargoed=registration_qs.filter(private_query).get_roots().count(),
@@ -87,7 +92,60 @@ def report(self, date):
total_daily=registration_qs.filter(daily_query).get_roots().count(),
public_daily=registration_qs.filter(public_query & daily_query).get_roots().count(),
embargoed_daily=registration_qs.filter(private_query & daily_query).get_roots().count(),
- embargoed_v2_daily=registration_qs.filter(private_query & daily_query & embargo_v2_query).get_roots().count(),
+ embargoed_v2_daily=registration_qs.filter(
+ private_query & daily_query & embargo_v2_query).get_roots().count(),
+ ),
+ )
+ reports.append(report_es8)
+
+ report = InstitutionSummaryReport(
+ report_date=date,
+ institution_id=institution._id,
+ institution_name=institution.name,
+ users=RunningTotal(
+ total=report_es8.users.total,
+ total_daily=report_es8.users.total_daily,
+ ),
+ nodes=NodeRunningTotals(
+ total=report_es8.nodes.total,
+ public=report_es8.nodes.public,
+ private=report_es8.nodes.private,
+
+ total_daily=report_es8.nodes.total_daily,
+ public_daily=report_es8.nodes.public_daily,
+ private_daily=report_es8.nodes.private_daily,
+ ),
+ # Projects use get_roots to remove children
+ projects=NodeRunningTotals(
+ total=report_es8.projects.total,
+ public=report_es8.projects.public,
+ private=report_es8.projects.private,
+
+ total_daily=report_es8.projects.total_daily,
+ public_daily=report_es8.projects.public_daily,
+ private_daily=report_es8.projects.private_daily,
+ ),
+ registered_nodes=RegistrationRunningTotals(
+ total=report_es8.registered_nodes.total,
+ public=report_es8.registered_nodes.public,
+ embargoed=report_es8.registered_nodes.embargoed,
+ embargoed_v2=report_es8.registered_nodes.embargoed_v2,
+
+ total_daily=report_es8.registered_nodes.total_daily,
+ public_daily=report_es8.registered_nodes.public_daily,
+ embargoed_daily=report_es8.registered_nodes.embargoed_daily,
+ embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily,
+ ),
+ registered_projects=RegistrationRunningTotals(
+ total=report_es8.registered_projects.total,
+ public=report_es8.registered_projects.public,
+ embargoed=report_es8.registered_projects.embargoed,
+ embargoed_v2=report_es8.registered_projects.embargoed_v2,
+
+ total_daily=report_es8.registered_projects.total_daily,
+ public_daily=report_es8.registered_projects.public_daily,
+ embargoed_daily=report_es8.registered_projects.embargoed_daily,
+ embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily,
),
)
diff --git a/osf/metrics/reporters/institution_summary_monthly.py b/osf/metrics/reporters/institution_summary_monthly.py
index 4748860db32..88d8e1fb891 100644
--- a/osf/metrics/reporters/institution_summary_monthly.py
+++ b/osf/metrics/reporters/institution_summary_monthly.py
@@ -5,9 +5,10 @@
from osf.models.spam import SpamStatus
from addons.osfstorage.models import OsfStorageFile
from osf.metrics.reports import InstitutionMonthlySummaryReport
+from osf.metrics.es8_metrics import MonthlyInstitutionSummaryReportEs8
+from osf.metrics.utils import cycle_coverage_yearmonth
from ._base import MonthlyReporter
-
class InstitutionalSummaryMonthlyReporter(MonthlyReporter):
"""Generate an InstitutionMonthlySummaryReport for each institution."""
@@ -20,7 +21,8 @@ def iter_report_kwargs(self, continue_after: dict | None = None):
def report(self, **report_kwargs):
_institution = Institution.objects.get(pk=report_kwargs['institution_pk'])
- return self.generate_report(_institution)
+ reports = self.generate_report(_institution)
+ return reports
def generate_report(self, institution):
node_queryset = institution.nodes.filter(
@@ -31,8 +33,9 @@ def generate_report(self, institution):
)
preprint_queryset = self.get_published_preprints(institution, self.yearmonth)
-
- return InstitutionMonthlySummaryReport(
+ reports = []
+ report_es8 = MonthlyInstitutionSummaryReportEs8(
+ cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
institution_id=institution._id,
user_count=institution.get_institution_users().count(),
private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False),
@@ -45,6 +48,23 @@ def generate_report(self, institution):
monthly_logged_in_user_count=self.get_monthly_logged_in_user_count(institution, self.yearmonth),
monthly_active_user_count=self.get_monthly_active_user_count(institution, self.yearmonth),
)
+ reports.append(report_es8)
+
+ report = InstitutionMonthlySummaryReport(
+ institution_id=report_es8.institution_id,
+ user_count=report_es8.user_count,
+ private_project_count=report_es8.private_project_count,
+ public_project_count=report_es8.public_project_count,
+ public_registration_count=report_es8.public_registration_count,
+ embargoed_registration_count=report_es8.embargoed_registration_count,
+ published_preprint_count=report_es8.published_preprint_count,
+ storage_byte_count=report_es8.storage_byte_count,
+ public_file_count=report_es8.public_file_count,
+ monthly_logged_in_user_count=report_es8.monthly_logged_in_user_count,
+ monthly_active_user_count=report_es8.monthly_active_user_count,
+ )
+ reports.append(report)
+ return reports
def _get_count(self, node_queryset, node_type, is_public):
return node_queryset.filter(type=node_type, is_public=is_public, root_id=F('pk')).count()
diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py
index 512472a3d96..31c31f8ff22 100644
--- a/osf/metrics/reporters/institutional_users.py
+++ b/osf/metrics/reporters/institutional_users.py
@@ -1,4 +1,5 @@
import dataclasses
+from typing import List
from django.contrib.contenttypes.models import ContentType
from django.db.models import Q, F, Sum
@@ -7,7 +8,8 @@
from osf.models.spam import SpamStatus
from addons.osfstorage.models import OsfStorageFile
from osf.metrics.reports import InstitutionalUserReport
-from osf.metrics.utils import YearMonth
+from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth
+from osf.metrics.es8_metrics import MonthlyInstitutionalUserReportEs8
from ._base import MonthlyReporter
@@ -38,7 +40,8 @@ def report(self, **report_kwargs):
_institution = osfdb.Institution.objects.get(pk=report_kwargs['institution_pk'])
_user = osfdb.OSFUser.objects.get(pk=report_kwargs['user_pk'])
_helper = _InstiUserReportHelper(_institution, _user, self.yearmonth)
- return _helper.report
+ # return both reports (es8 + legacy); monthly_reporter_do iterates over them
+ return _helper.reports
# helper
@@ -47,11 +50,13 @@ class _InstiUserReportHelper:
institution: osfdb.Institution
user: osfdb.OSFUser
yearmonth: YearMonth
- report: InstitutionalUserReport = dataclasses.field(init=False)
+ reports: List[InstitutionalUserReport | MonthlyInstitutionalUserReportEs8] = dataclasses.field(init=False)
def __post_init__(self):
_affiliation = self.user.get_institution_affiliation(self.institution._id)
- self.report = InstitutionalUserReport(
+ self.reports = []
+ report_es8 = MonthlyInstitutionalUserReportEs8(
+ cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
institution_id=self.institution._id,
user_id=self.user._id,
user_name=self.user.fullname,
@@ -72,6 +77,25 @@ def __post_init__(self):
published_preprint_count=self._published_preprint_queryset().count(),
storage_byte_count=self._storage_byte_count(),
)
+ self.reports.append(report_es8)
+ report = InstitutionalUserReport(
+ institution_id=report_es8.institution_id,
+ user_id=report_es8.user_id,
+ user_name=report_es8.user_name,
+ department_name=report_es8.department_name,
+ month_last_login=report_es8.month_last_login,
+ month_last_active=report_es8.month_last_active,
+ account_creation_date=report_es8.account_creation_date,
+ orcid_id=report_es8.orcid_id,
+ public_project_count=report_es8.public_project_count,
+ private_project_count=report_es8.private_project_count,
+ public_registration_count=report_es8.public_registration_count,
+ embargoed_registration_count=report_es8.embargoed_registration_count,
+ public_file_count=report_es8.public_file_count,
+ published_preprint_count=report_es8.published_preprint_count,
+ storage_byte_count=report_es8.storage_byte_count,
+ )
+ self.reports.append(report)
@property
def before_datetime(self):
diff --git a/osf/metrics/reporters/new_user_domain.py b/osf/metrics/reporters/new_user_domain.py
index ec13aad860f..125e02754d7 100644
--- a/osf/metrics/reporters/new_user_domain.py
+++ b/osf/metrics/reporters/new_user_domain.py
@@ -3,6 +3,8 @@
from osf.models import OSFUser
from osf.metrics.reports import NewUserDomainReport
+from osf.metrics.es8_metrics import DailyNewUserDomainReportEs8
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
logger = logging.getLogger(__name__)
@@ -20,11 +22,19 @@ def report(self, date):
email.split('@')[-1]
for email in new_user_emails
)
- return [
- NewUserDomainReport(
- report_date=date,
+ reports = []
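+        # emit an es8 report plus a matching legacy report for each email domain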
+ for domain_name, count in domain_names.items():
+ report_es8 = DailyNewUserDomainReportEs8(
+ cycle_coverage=cycle_coverage_date(date),
domain_name=domain_name,
new_user_count=count,
)
- for domain_name, count in domain_names.items()
- ]
+ reports.append(report_es8)
+
+ report = NewUserDomainReport(
+ report_date=date,
+ domain_name=report_es8.domain_name,
+ new_user_count=report_es8.new_user_count,
+ )
+ reports.append(report)
+ return reports
diff --git a/osf/metrics/reporters/node_count.py b/osf/metrics/reporters/node_count.py
index 0a4120ca1f9..23f4c9bb78c 100644
--- a/osf/metrics/reporters/node_count.py
+++ b/osf/metrics/reporters/node_count.py
@@ -7,9 +7,14 @@
NodeRunningTotals,
RegistrationRunningTotals,
)
+from osf.metrics.es8_metrics import (
+ DailyNodeSummaryReportEs8,
+ NodeRunningTotals as NodeRunningTotalsEs8,
+ RegistrationRunningTotals as RegistrationRunningTotalsEs8
+)
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
-
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -35,11 +40,11 @@ def report(self, date):
embargo_v2_query = Q(root__embargo__end_date__date__gt=date)
exclude_spam = ~Q(spam_status__in=[SpamStatus.SPAM, SpamStatus.FLAGGED])
-
- report = NodeSummaryReport(
- report_date=date,
+ reports = []
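+        # run each counting query once, on the es8 report; the legacy report mirrors the values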
+ report_es8 = DailyNodeSummaryReportEs8(
+ cycle_coverage=cycle_coverage_date(date),
# Nodes - the number of projects and components
- nodes=NodeRunningTotals(
+ nodes=NodeRunningTotalsEs8(
total=node_qs.count(),
total_excluding_spam=node_qs.filter(exclude_spam).count(),
public=node_qs.filter(public_query).count(),
@@ -50,7 +55,7 @@ def report(self, date):
private_daily=node_qs.filter(private_query & created_today_query).count(),
),
# Projects - the number of top-level only projects
- projects=NodeRunningTotals(
+ projects=NodeRunningTotalsEs8(
total=node_qs.get_roots().count(),
total_excluding_spam=node_qs.get_roots().filter(exclude_spam).count(),
public=node_qs.filter(public_query).get_roots().count(),
@@ -61,7 +66,7 @@ def report(self, date):
private_daily=node_qs.filter(private_query & created_today_query).get_roots().count(),
),
# Registered Nodes - the number of registered projects and components
- registered_nodes=RegistrationRunningTotals(
+ registered_nodes=RegistrationRunningTotalsEs8(
total=registration_qs.count(),
public=registration_qs.filter(public_query).count(),
embargoed=registration_qs.filter(private_query).count(),
@@ -75,7 +80,7 @@ def report(self, date):
),
# Registered Projects - the number of registered top level projects
- registered_projects=RegistrationRunningTotals(
+ registered_projects=RegistrationRunningTotalsEs8(
total=registration_qs.get_roots().count(),
public=registration_qs.filter(public_query).get_roots().count(),
embargoed=registration_qs.filter(private_query).get_roots().count(),
@@ -88,5 +93,58 @@ def report(self, date):
withdrawn_daily=registration_qs.filter(retracted_query & retracted_today_query).get_roots().count(),
),
)
+ reports.append(report_es8)
+ report = NodeSummaryReport(
+ report_date=date,
+ # Nodes - the number of projects and components
+ nodes=NodeRunningTotals(
+ total=report_es8.nodes.total,
+ total_excluding_spam=report_es8.nodes.total_excluding_spam,
+ public=report_es8.nodes.public,
+ private=report_es8.nodes.private,
+ total_daily=report_es8.nodes.total_daily,
+ total_daily_excluding_spam=report_es8.nodes.total_daily_excluding_spam,
+ public_daily=report_es8.nodes.public_daily,
+ private_daily=report_es8.nodes.private_daily,
+ ),
+ # Projects - the number of top-level only projects
+ projects=NodeRunningTotals(
+ total=report_es8.projects.total,
+ total_excluding_spam=report_es8.projects.total_excluding_spam,
+ public=report_es8.projects.public,
+ private=report_es8.projects.private,
+ total_daily=report_es8.projects.total_daily,
+ total_daily_excluding_spam=report_es8.projects.total_daily_excluding_spam,
+ public_daily=report_es8.projects.public_daily,
+ private_daily=report_es8.projects.private_daily,
+ ),
+ # Registered Nodes - the number of registered projects and components
+ registered_nodes=RegistrationRunningTotals(
+ total=report_es8.registered_nodes.total,
+ public=report_es8.registered_nodes.public,
+ embargoed=report_es8.registered_nodes.embargoed,
+ embargoed_v2=report_es8.registered_nodes.embargoed_v2,
+ withdrawn=report_es8.registered_nodes.withdrawn,
+ total_daily=report_es8.registered_nodes.total_daily,
+ public_daily=report_es8.registered_nodes.public_daily,
+ embargoed_daily=report_es8.registered_nodes.embargoed_daily,
+ embargoed_v2_daily=report_es8.registered_nodes.embargoed_v2_daily,
+ withdrawn_daily=report_es8.registered_nodes.withdrawn_daily,
+ ),
+ # Registered Projects - the number of registered top level projects
+ registered_projects=RegistrationRunningTotals(
+ total=report_es8.registered_projects.total,
+ public=report_es8.registered_projects.public,
+ embargoed=report_es8.registered_projects.embargoed,
+ embargoed_v2=report_es8.registered_projects.embargoed_v2,
+ withdrawn=report_es8.registered_projects.withdrawn,
+ total_daily=report_es8.registered_projects.total_daily,
+ public_daily=report_es8.registered_projects.public_daily,
+ embargoed_daily=report_es8.registered_projects.embargoed_daily,
+ embargoed_v2_daily=report_es8.registered_projects.embargoed_v2_daily,
+ withdrawn_daily=report_es8.registered_projects.withdrawn_daily,
+ ),
+ )
-        return [report]
+        reports.append(report)
+        return reports
diff --git a/osf/metrics/reporters/osfstorage_file_count.py b/osf/metrics/reporters/osfstorage_file_count.py
index 2f35e1e81fd..6ddeb89945b 100644
--- a/osf/metrics/reporters/osfstorage_file_count.py
+++ b/osf/metrics/reporters/osfstorage_file_count.py
@@ -4,9 +4,13 @@
from osf.metrics.reports import OsfstorageFileCountReport, FileRunningTotals
from osf.models import AbstractNode, Preprint
+from osf.metrics.es8_metrics import (
+ DailyOsfstorageFileCountReportEs8,
+ FileRunningTotals as FileRunningTotalsEs8
+)
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
-
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
@@ -32,9 +36,11 @@ def report(self, date):
daily_query = Q(created__date=date)
- report = OsfstorageFileCountReport(
- report_date=date,
- files=FileRunningTotals(
+ reports = []
+
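+        # count files once here; the legacy report below reuses these totals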
+ report_es8 = DailyOsfstorageFileCountReportEs8(
+ cycle_coverage=cycle_coverage_date(date),
+ files=FileRunningTotalsEs8(
total=file_qs.count(),
public=file_qs.filter(public_query).count(),
private=file_qs.filter(private_query).count(),
@@ -43,5 +49,19 @@ def report(self, date):
private_daily=file_qs.filter(private_query & daily_query).count(),
),
)
+ reports.append(report_es8)
+
+ report = OsfstorageFileCountReport(
+ report_date=date,
+ files=FileRunningTotals(
+ total=report_es8.files.total,
+ public=report_es8.files.public,
+ private=report_es8.files.private,
+ total_daily=report_es8.files.total_daily,
+ public_daily=report_es8.files.public_daily,
+ private_daily=report_es8.files.private_daily,
+ ),
+ )
-        return [report]
+        reports.append(report)
+        return reports
diff --git a/osf/metrics/reporters/preprint_count.py b/osf/metrics/reporters/preprint_count.py
index 23f68bc7736..85ba639a32f 100644
--- a/osf/metrics/reporters/preprint_count.py
+++ b/osf/metrics/reporters/preprint_count.py
@@ -3,6 +3,8 @@
from osf.metrics import PreprintSummaryReport
from website import settings
+from osf.metrics.es8_metrics import DailyPreprintSummaryReportEs8
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
logger = logging.getLogger(__name__)
@@ -48,13 +50,20 @@ def report(self, date):
for preprint_provider in PreprintProvider.objects.all():
elastic_query = get_elastic_query(date, preprint_provider)
resp = requests.post(f'{settings.SHARE_URL}api/v2/search/creativeworks/_search', json=elastic_query).json()
- reports.append(
- PreprintSummaryReport(
- report_date=date,
- provider_key=preprint_provider._id,
- preprint_count=resp['hits']['total'],
- )
+
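+            # per provider: build the es8 report, then a legacy copy of it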
+ report_es8 = DailyPreprintSummaryReportEs8(
+ cycle_coverage=cycle_coverage_date(date),
+ provider_key=preprint_provider._id,
+ preprint_count=resp['hits']['total'],
+ )
+ reports.append(report_es8)
+
+ report = PreprintSummaryReport(
+ report_date=date,
+ provider_key=report_es8.provider_key,
+ preprint_count=report_es8.preprint_count,
)
+ reports.append(report)
logger.info('{} Preprints counted for the provider {}'.format(resp['hits']['total'], preprint_provider.name))
return reports
diff --git a/osf/metrics/reporters/private_spam_metrics.py b/osf/metrics/reporters/private_spam_metrics.py
index 40f259af325..fde545247e6 100644
--- a/osf/metrics/reporters/private_spam_metrics.py
+++ b/osf/metrics/reporters/private_spam_metrics.py
@@ -1,8 +1,11 @@
from osf.metrics.reports import PrivateSpamMetricsReport
from osf.external.oopspam.client import OOPSpamClient
from osf.external.askismet.client import AkismetClient
+from osf.metrics.es8_metrics import MonthlyPrivateSpamMetricsReportEs8
+from osf.metrics.utils import cycle_coverage_yearmonth
from ._base import MonthlyReporter
+
class PrivateSpamMetricsReporter(MonthlyReporter):
report_name = 'Private Spam Metrics'
@@ -13,8 +16,10 @@ def report(self):
oopspam_client = OOPSpamClient()
akismet_client = AkismetClient()
- report = PrivateSpamMetricsReport(
- report_yearmonth=str(self.yearmonth),
+ reports = []
+
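+        # query the spam clients once; both report types share the resulting counts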
+ report_es8 = MonthlyPrivateSpamMetricsReportEs8(
+ cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
node_oopspam_flagged=oopspam_client.get_flagged_count(target_month, next_month, category='node'),
node_oopspam_hammed=oopspam_client.get_hammed_count(target_month, next_month, category='node'),
node_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='node'),
@@ -24,5 +29,19 @@ def report(self):
preprint_akismet_flagged=akismet_client.get_flagged_count(target_month, next_month, category='preprint'),
preprint_akismet_hammed=akismet_client.get_hammed_count(target_month, next_month, category='preprint')
)
+ reports.append(report_es8)
+
+ report = PrivateSpamMetricsReport(
+ report_yearmonth=str(self.yearmonth),
+ node_oopspam_flagged=report_es8.node_oopspam_flagged,
+ node_oopspam_hammed=report_es8.node_oopspam_hammed,
+ node_akismet_flagged=report_es8.node_akismet_flagged,
+ node_akismet_hammed=report_es8.node_akismet_hammed,
+ preprint_oopspam_flagged=report_es8.preprint_oopspam_flagged,
+ preprint_oopspam_hammed=report_es8.preprint_oopspam_hammed,
+ preprint_akismet_flagged=report_es8.preprint_akismet_flagged,
+ preprint_akismet_hammed=report_es8.preprint_akismet_hammed,
+ )
-        return report
+        reports.append(report)
+        return reports
diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py
index cc401d50bd7..085bac38684 100644
--- a/osf/metrics/reporters/public_item_usage.py
+++ b/osf/metrics/reporters/public_item_usage.py
@@ -3,8 +3,11 @@
import typing
import waffle
+
+from osf.metrics.es8_metrics import MonthlyPublicItemUsageReportEs8
+
if typing.TYPE_CHECKING:
- import elasticsearch_dsl as edsl
+ import elasticsearch6_dsl as edsl
import osf.features
from osf.metadata.osf_gathering import OsfmapPartition
@@ -18,7 +21,7 @@
PreprintView,
)
from osf.metrics.reports import PublicItemUsageReport
-from osf.metrics.utils import YearMonth
+from osf.metrics.utils import YearMonth, cycle_coverage_yearmonth
from osf import models as osfdb
from website import settings as website_settings
from ._base import MonthlyReporter
@@ -61,16 +64,17 @@ def report(self, **report_kwargs):
if _guid is None or _guid.referent is None:
raise _SkipItem
_obj = _guid.referent
- _report = self._init_report(_obj)
- self._fill_report_counts(_report, _obj)
- if not any((
- _report.view_count,
- _report.view_session_count,
- _report.download_count,
- _report.download_session_count,
- )):
- raise _SkipItem
- return _report
+ _reports = self._init_report(_obj)
+ for _report in _reports:
+ self._fill_report_counts(_report, _obj)
+ if not any((
+ _report.view_count,
+ _report.view_session_count,
+ _report.download_count,
+ _report.download_session_count,
+ )):
+ raise _SkipItem
+ return _reports
except _SkipItem:
return None
@@ -131,16 +135,27 @@ def _preprintdownload_osfids(self, after_osfid: str | None) -> typing.Iterator[s
)
return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid)
- def _init_report(self, osf_obj) -> PublicItemUsageReport:
+ def _init_report(self, osf_obj) -> typing.List[PublicItemUsageReport | MonthlyPublicItemUsageReportEs8]:
if not _is_item_public(osf_obj):
raise _SkipItem
- return PublicItemUsageReport(
+ reports = []
+ report_es8 = MonthlyPublicItemUsageReportEs8(
+ cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
item_osfid=osf_obj._id,
item_type=[get_item_type(osf_obj)],
provider_id=[get_provider_id(osf_obj)],
platform_iri=[website_settings.DOMAIN],
+ )
+ reports.append(report_es8)
+ report = PublicItemUsageReport(
+ item_osfid=report_es8.item_osfid,
+ item_type=report_es8.item_type,
+ provider_id=report_es8.provider_id,
+ platform_iri=report_es8.platform_iri,
# leave counts null; will be set if there's data
)
+ reports.append(report)
+ return reports
def _fill_report_counts(self, report, osf_obj):
if (
@@ -154,31 +169,43 @@ def _fill_report_counts(self, report, osf_obj):
(
report.view_count,
report.view_session_count,
- ) = self._countedusage_view_counts(osf_obj)
+ ) = self._countedusage_view_counts(osf_obj, cumulative=False)
(
report.download_count,
report.download_session_count,
- ) = self._countedusage_download_counts(osf_obj)
+ ) = self._countedusage_download_counts(osf_obj, cumulative=False)
+
+ (
+ report.cumulative_view_count,
+ report.cumulative_view_session_count,
+ ) = self._countedusage_view_counts(osf_obj, cumulative=True)
+        (
+            report.cumulative_download_count,
+            report.cumulative_download_session_count,
+        ) = self._countedusage_download_counts(osf_obj, cumulative=True)
+
-    def _base_usage_search(self):
+    def _base_usage_search(self, cumulative: bool = False):
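+        # cumulative counts drop the lower time bound; monthly counts keep both bounds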
+ timestamp_filter = {
+ 'lt': self.yearmonth.month_end(),
+ }
+ if not cumulative:
+ timestamp_filter['gte'] = self.yearmonth.month_start()
return (
CountedAuthUsage.search()
.filter('term', item_public=True)
- .filter('range', timestamp={
- 'gte': self.yearmonth.month_start(),
- 'lt': self.yearmonth.month_end(),
- })
+ .filter('range', timestamp=timestamp_filter)
.extra(size=0) # only aggregations, no hits
)
- def _countedusage_view_counts(self, osf_obj) -> tuple[int, int]:
+ def _countedusage_view_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]:
'''compute view_session_count separately to avoid double-counting
(the same session may be represented in both the composite agg on `item_guid`
and that on `surrounding_guids`)
'''
_search = (
- self._base_usage_search()
+ self._base_usage_search(cumulative=cumulative)
.query(
'bool',
filter=[
@@ -206,10 +233,10 @@ def _countedusage_view_counts(self, osf_obj) -> tuple[int, int]:
)
return (_view_count, _view_session_count)
- def _countedusage_download_counts(self, osf_obj) -> tuple[int, int]:
+ def _countedusage_download_counts(self, osf_obj, cumulative: bool = False) -> tuple[int, int]:
'''aggregate downloads on each osfid (not including components/files)'''
_search = (
- self._base_usage_search()
+ self._base_usage_search(cumulative=cumulative)
.filter('term', item_guid=osf_obj._id)
.filter('term', action_labels=CountedAuthUsage.ActionLabel.DOWNLOAD.value)
)
diff --git a/osf/metrics/reporters/spam_count.py b/osf/metrics/reporters/spam_count.py
index 319381fe899..2fbac671ad1 100644
--- a/osf/metrics/reporters/spam_count.py
+++ b/osf/metrics/reporters/spam_count.py
@@ -1,9 +1,11 @@
from osf.models import OSFUser
from osf.metrics.reports import SpamSummaryReport
-from ._base import MonthlyReporter
from osf.models import PreprintLog, NodeLog
from osf.models.spam import SpamStatus
+from osf.metrics.es8_metrics import MonthlySpamSummaryReportEs8
+from osf.metrics.utils import cycle_coverage_yearmonth
+from ._base import MonthlyReporter
class SpamCountReporter(MonthlyReporter):
@@ -11,9 +13,9 @@ def report(self, **report_kwargs):
assert not report_kwargs
target_month = self.yearmonth.month_start()
next_month = self.yearmonth.month_end()
-
- return SpamSummaryReport(
- # Node Log entries
+ reports = []
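+        # aggregate the log queries on the es8 report; the legacy report copies them below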
+ report_es8 = MonthlySpamSummaryReportEs8(
+ cycle_coverage=cycle_coverage_yearmonth(self.yearmonth),
node_confirmed_spam=NodeLog.objects.filter(
action=NodeLog.CONFIRM_SPAM,
created__gt=target_month,
@@ -79,3 +81,23 @@ def report(self, **report_kwargs):
created__lt=next_month,
).count()
)
+ reports.append(report_es8)
+ report = SpamSummaryReport(
+ # Node Log entries
+ node_confirmed_spam=report_es8.node_confirmed_spam,
+ node_confirmed_ham=report_es8.node_confirmed_ham,
+ node_flagged=report_es8.node_flagged,
+ # Registration Log entries
+ registration_confirmed_spam=report_es8.registration_confirmed_spam,
+ registration_confirmed_ham=report_es8.registration_confirmed_ham,
+ registration_flagged=report_es8.registration_flagged,
+ # Preprint Log entries
+ preprint_confirmed_spam=report_es8.preprint_confirmed_spam,
+ preprint_confirmed_ham=report_es8.preprint_confirmed_ham,
+ preprint_flagged=report_es8.preprint_flagged,
+ # New Users marked as Spam/Ham
+ user_marked_as_spam=report_es8.user_marked_as_spam,
+ user_marked_as_ham=report_es8.user_marked_as_ham,
+ )
+ reports.append(report)
+ return reports
diff --git a/osf/metrics/reporters/storage_addon_usage.py b/osf/metrics/reporters/storage_addon_usage.py
index 704254795f0..893373cebd1 100644
--- a/osf/metrics/reporters/storage_addon_usage.py
+++ b/osf/metrics/reporters/storage_addon_usage.py
@@ -13,6 +13,12 @@
from osf.metrics.reports import StorageAddonUsage, RunningTotal, UsageByStorageAddon
from osf.models import SpamStatus, Tag
from website import settings
+from osf.metrics.es8_metrics import (
+ DailyStorageAddonUsageReportEs8,
+ UsageByStorageAddon as UsageByStorageAddonEs8,
+ RunningTotal as RunningTotalEs8
+)
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
logger = logging.getLogger(__name__)
@@ -125,45 +131,89 @@ def report(self, date):
if 'storage' in addon_config.categories
}
+ usage_by_addon_es8 = []
usage_by_addon = []
for short_name, addon_config in storage_addon_configs.items():
- user_counts = storage_addon_user_counts(date, addon_config.get_model('UserSettings'))
- node_counts = storage_addon_node_counts(date, addon_config.get_model('NodeSettings'))
-
- usage_by_addon.append(
- UsageByStorageAddon(
- addon_shortname=short_name,
- enabled_usersettings=RunningTotal(
- total=user_counts.get('enabled_total', 0),
- total_daily=user_counts.get('enabled_daily', 0),
- ),
- deleted_usersettings=RunningTotal(
- total=user_counts.get('deleted_total', 0),
- total_daily=user_counts.get('deleted_daily', 0),
- ),
- linked_usersettings=RunningTotal(
- total=user_counts.get('linked_total', 0),
- total_daily=user_counts.get('linked_daily', 0),
- ),
- usersetting_links=RunningTotal(
- total=user_counts.get('link_count_total', 0),
- total_daily=user_counts.get('link_count_daily', 0),
- ),
- connected_nodesettings=RunningTotal(
- total=node_counts.get('connected_total', 0),
- total_daily=node_counts.get('connected_daily', 0),
- ),
- disconnected_nodesettings=RunningTotal(
- total=node_counts.get('disconnected_total', 0),
- total_daily=node_counts.get('disconnected_daily', 0),
- ),
- deleted_nodesettings=RunningTotal(
- total=node_counts.get('deleted_total', 0),
- total_daily=node_counts.get('deleted_daily', 0),
- ),
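+            # skip addon configs whose UserSettings/NodeSettings models aren't installed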
+ try:
+ _UserSettings = addon_config.get_model('UserSettings')
+ _NodeSettings = addon_config.get_model('NodeSettings')
+ except LookupError:
+ continue
+ user_counts = storage_addon_user_counts(date, _UserSettings)
+ node_counts = storage_addon_node_counts(date, _NodeSettings)
+            usage_by_storage_addon_es8 = UsageByStorageAddonEs8(
+ addon_shortname=short_name,
+ enabled_usersettings=RunningTotalEs8(
+ total=user_counts.get('enabled_total', 0),
+ total_daily=user_counts.get('enabled_daily', 0),
+ ),
+ deleted_usersettings=RunningTotalEs8(
+ total=user_counts.get('deleted_total', 0),
+ total_daily=user_counts.get('deleted_daily', 0),
+ ),
+ linked_usersettings=RunningTotalEs8(
+ total=user_counts.get('linked_total', 0),
+ total_daily=user_counts.get('linked_daily', 0),
+ ),
+ usersetting_links=RunningTotalEs8(
+ total=user_counts.get('link_count_total', 0),
+ total_daily=user_counts.get('link_count_daily', 0),
+ ),
+ connected_nodesettings=RunningTotalEs8(
+ total=node_counts.get('connected_total', 0),
+ total_daily=node_counts.get('connected_daily', 0),
+ ),
+ disconnected_nodesettings=RunningTotalEs8(
+ total=node_counts.get('disconnected_total', 0),
+ total_daily=node_counts.get('disconnected_daily', 0),
+ ),
+ deleted_nodesettings=RunningTotalEs8(
+ total=node_counts.get('deleted_total', 0),
+ total_daily=node_counts.get('deleted_daily', 0),
+ ),
+ )
+            usage_by_addon_es8.append(usage_by_storage_addon_es8)
+            usage_by_storage_addon = UsageByStorageAddon(
+                addon_shortname=usage_by_storage_addon_es8.addon_shortname,
+                enabled_usersettings=RunningTotal(
+                    total=usage_by_storage_addon_es8.enabled_usersettings.total,
+                    total_daily=usage_by_storage_addon_es8.enabled_usersettings.total_daily,
+                ),
+                deleted_usersettings=RunningTotal(
+                    total=usage_by_storage_addon_es8.deleted_usersettings.total,
+                    total_daily=usage_by_storage_addon_es8.deleted_usersettings.total_daily,
+                ),
+                linked_usersettings=RunningTotal(
+                    total=usage_by_storage_addon_es8.linked_usersettings.total,
+                    total_daily=usage_by_storage_addon_es8.linked_usersettings.total_daily,
+                ),
+                usersetting_links=RunningTotal(
+                    total=usage_by_storage_addon_es8.usersetting_links.total,
+                    total_daily=usage_by_storage_addon_es8.usersetting_links.total_daily,
+                ),
+                connected_nodesettings=RunningTotal(
+                    total=usage_by_storage_addon_es8.connected_nodesettings.total,
+                    total_daily=usage_by_storage_addon_es8.connected_nodesettings.total_daily,
+                ),
+                disconnected_nodesettings=RunningTotal(
+                    total=usage_by_storage_addon_es8.disconnected_nodesettings.total,
+                    total_daily=usage_by_storage_addon_es8.disconnected_nodesettings.total_daily,
+                ),
+                deleted_nodesettings=RunningTotal(
+                    total=usage_by_storage_addon_es8.deleted_nodesettings.total,
+                    total_daily=usage_by_storage_addon_es8.deleted_nodesettings.total_daily,
)
)
- return [StorageAddonUsage(
+ usage_by_addon.append(usage_by_storage_addon)
+ reports = []
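+        # pair the es8 report (built from the es8 usage rows) with the legacy report (legacy rows)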
+ report_es8 = DailyStorageAddonUsageReportEs8(
+ cycle_coverage=cycle_coverage_date(date),
+            usage_by_addon=usage_by_addon_es8,
+ )
+ reports.append(report_es8)
+ report = StorageAddonUsage(
report_date=date,
usage_by_addon=usage_by_addon,
- )]
+ )
+ reports.append(report)
+ return reports
diff --git a/osf/metrics/reporters/user_count.py b/osf/metrics/reporters/user_count.py
index e0a61c7bb10..121b830c466 100644
--- a/osf/metrics/reporters/user_count.py
+++ b/osf/metrics/reporters/user_count.py
@@ -1,14 +1,17 @@
from osf.models import OSFUser
from osf.metrics import UserSummaryReport
+from osf.metrics.es8_metrics import DailyUserSummaryReportEs8
+from osf.metrics.utils import cycle_coverage_date
from ._base import DailyReporter
class UserCountReporter(DailyReporter):
def report(self, report_date):
- report = UserSummaryReport(
- report_date=report_date,
+ reports = []
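+        # compute the user counts once; both report types share them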
+ report_es8 = DailyUserSummaryReportEs8(
+ cycle_coverage=cycle_coverage_date(report_date),
active=OSFUser.objects.filter(is_active=True, date_confirmed__date__lte=report_date).count(),
deactivated=OSFUser.objects.filter(date_disabled__isnull=False, date_disabled__date__lte=report_date).count(),
merged=OSFUser.objects.filter(date_registered__date__lte=report_date, merged_by__isnull=False).count(),
@@ -16,5 +19,16 @@ def report(self, report_date):
new_users_with_institution_daily=OSFUser.objects.filter(is_active=True, date_confirmed__date=report_date, institutionaffiliation__isnull=False).count(),
unconfirmed=OSFUser.objects.filter(date_registered__date__lte=report_date, date_confirmed__isnull=True).count(),
)
+ reports.append(report_es8)
+ report = UserSummaryReport(
+ report_date=report_date,
+ active=report_es8.active,
+ deactivated=report_es8.deactivated,
+ merged=report_es8.merged,
+ new_users_daily=report_es8.new_users_daily,
+ new_users_with_institution_daily=report_es8.new_users_with_institution_daily,
+ unconfirmed=report_es8.unconfirmed,
+ )
-        return [report]
+        reports.append(report)
+        return reports
diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py
index ffbcfb4c9b8..62479e359cd 100644
--- a/osf/metrics/reports.py
+++ b/osf/metrics/reports.py
@@ -4,7 +4,7 @@
from django.dispatch import receiver
from elasticsearch6_dsl import InnerDoc
-from elasticsearch_metrics import metrics
+import elasticsearch_metrics.imps.elastic6 as metrics
from elasticsearch_metrics.signals import pre_save as metrics_pre_save
from osf.metrics.utils import stable_key, YearMonth
@@ -120,6 +120,8 @@ def save(self, *args, **kwargs):
@receiver(metrics_pre_save)
def set_report_id(sender, instance, **kwargs):
+ if not issubclass(sender, metrics.Metric):
+ return # skip es8 record types
try:
_unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS
except AttributeError:
diff --git a/osf/metrics/utils.py b/osf/metrics/utils.py
index 973b8bf1ef3..c5d49f293cf 100644
--- a/osf/metrics/utils.py
+++ b/osf/metrics/utils.py
@@ -6,6 +6,28 @@
from hashlib import sha256
from typing import ClassVar
+from elasticsearch_metrics.util.timeparts import format_timeparts
+
+
+def cycle_coverage_date(given_date: datetime.date) -> str:
+ """
+ >>> cycle_coverage_date(datetime.date(1234, 5, 6))
+ '1234.5.6'
+ >>> cycle_coverage_date(datetime.datetime(7654, 3, 2, 1))
+ '7654.3.2'
+ """
+ return format_timeparts(given_date, 3)
+
+
+def cycle_coverage_yearmonth(given_ym: YearMonth | datetime.date) -> str:
+ """
+ >>> cycle_coverage_yearmonth(YearMonth(2222, 33))
+ '2222.33'
+ >>> cycle_coverage_yearmonth(datetime.date(1234, 5, 6))
+ '1234.5'
+ """
+ return format_timeparts((given_ym.year, given_ym.month), 2)
+
def stable_key(*key_parts):
"""hash function for use in osf.metrics
diff --git a/osf/models/registrations.py b/osf/models/registrations.py
index e1d819b43bf..f13489f1201 100644
--- a/osf/models/registrations.py
+++ b/osf/models/registrations.py
@@ -14,15 +14,18 @@
UserObjectPermissionBase,
)
from dirtyfields import DirtyFieldsMixin
+import waffle
from framework.auth import Auth
from framework.exceptions import PermissionsError
+from osf import features
from osf.models import Identifier
from osf.utils.fields import NonNaiveDateTimeField, LowercaseCharField
from osf.utils.permissions import ADMIN, READ, WRITE
from osf.exceptions import NodeStateError, DraftRegistrationStateError
from osf.external.internet_archive.tasks import archive_to_ia, update_ia_metadata
from osf.metrics import RegistriesModerationMetrics
+from osf.metrics.es8_metrics import RegistriesModerationEventEs8
from osf.models.notification_type import NotificationTypeEnum
from .action import RegistrationAction
from .archive import ArchiveJob
@@ -782,7 +785,17 @@ def _write_registration_action(self, from_state, to_state, initiated_by, comment
comment=comment
)
action.save()
- RegistriesModerationMetrics.record_transitions(action)
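+        # keep legacy es6 moderation metrics behind the waffle switch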
+ if waffle.switch_is_active(features.ELASTICSEARCH_METRICS):
+ RegistriesModerationMetrics.record_transitions(action)
+ RegistriesModerationEventEs8.record(
+ registration_id=action.target._id,
+ provider_id=action.target.provider._id,
+ from_state=action.from_state,
+ to_state=action.to_state,
+ trigger=action.trigger,
+ user_id=action.creator._id,
+ comment=action.comment,
+ )
moderation_notifications = {
RegistrationModerationTriggers.SUBMIT: notify.notify_submit,
diff --git a/osf_tests/management_commands/test_reindex_es6.py b/osf_tests/management_commands/test_reindex_es6.py
index 5e01be656a8..36158c18da6 100644
--- a/osf_tests/management_commands/test_reindex_es6.py
+++ b/osf_tests/management_commands/test_reindex_es6.py
@@ -10,7 +10,7 @@
AuthUserFactory
)
-from elasticsearch_metrics.field import Keyword
+from elasticsearch6_dsl import Keyword
from tests.json_api_test_app import JSONAPITestApp
diff --git a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py
index 05baa4d38e7..f40b5dacec6 100644
--- a/osf_tests/metrics/reporters/test_institutional_summary_reporter.py
+++ b/osf_tests/metrics/reporters/test_institutional_summary_reporter.py
@@ -3,6 +3,7 @@
import logging
from django.test import TestCase
from osf.metrics.reporters import InstitutionalSummaryMonthlyReporter
+from osf.metrics.reports import InstitutionMonthlySummaryReport
from osf.metrics.utils import YearMonth
from osf_tests.factories import (
InstitutionFactory,
@@ -79,10 +80,10 @@ def _create_active_user(cls, institution, date_confirmed):
def test_report_generation(self):
reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth)
- reports = list_monthly_reports(reporter)
- self.assertEqual(len(reports), 1)
+ reports_raw = list_monthly_reports(reporter)
+ self.assertEqual(len(reports_raw[0]), 2)
- report = reports[0]
+ report = next(r for r in reports_raw[0] if isinstance(r, InstitutionMonthlySummaryReport))
self.assertEqual(report.institution_id, self._institution._id)
self.assertEqual(report.user_count, 2) # _logged_in_user and _active_user
self.assertEqual(report.public_project_count, 1)
@@ -115,7 +116,8 @@ def test_report_generation_multiple_institutions(self):
# Run the reporter for the current month (February 2018)
reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth)
- reports = list_monthly_reports(reporter)
+ reports_raw = list_monthly_reports(reporter)
+ reports = [item for sublist in reports_raw for item in sublist if isinstance(item, InstitutionMonthlySummaryReport)]
self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3
# Extract reports by institution
@@ -264,7 +266,8 @@ def test_high_counts_multiple_institutions(self):
if enable_benchmarking:
reporter_start_time = time.time()
reporter = InstitutionalSummaryMonthlyReporter(self._yearmonth)
- reports = list_monthly_reports(reporter)
+ reports_raw = list_monthly_reports(reporter)
+ reports = [item for sublist in reports_raw for item in sublist if isinstance(item, InstitutionMonthlySummaryReport)]
assert len(reports) == additional_institution_count + 1
if enable_benchmarking:
diff --git a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
index 69bd266285a..082b330afd8 100644
--- a/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
+++ b/osf_tests/metrics/reporters/test_public_item_usage_reporter.py
@@ -174,8 +174,10 @@ def test_no_data(self, ym_empty):
def test_reporter(self, ym_empty, ym_sparse, ym_busy, sparse_month_usage, busy_month_item0, busy_month_item1, busy_month_item2, item0):
_empty = list_monthly_reports(PublicItemUsageReporter(ym_empty))
- _sparse = list_monthly_reports(PublicItemUsageReporter(ym_sparse))
- _busy = list_monthly_reports(PublicItemUsageReporter(ym_busy))
+ _sparse_raw = list_monthly_reports(PublicItemUsageReporter(ym_sparse))
+ _sparse = [item for sublist in _sparse_raw for item in sublist if isinstance(item, PublicItemUsageReport)]
+ _busy_raw = list_monthly_reports(PublicItemUsageReporter(ym_busy))
+ _busy = [item for sublist in _busy_raw for item in sublist if isinstance(item, PublicItemUsageReport)]
# empty month:
assert _empty == []
diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py
index 46375184f95..5228e2342c5 100644
--- a/osf_tests/metrics/test_daily_report.py
+++ b/osf_tests/metrics/test_daily_report.py
@@ -2,7 +2,7 @@
from unittest import mock
import pytest
-from elasticsearch_metrics import metrics
+import elasticsearch_metrics.imps.elastic6 as metrics
from osf.metrics.reports import DailyReport, ReportInvalid
@@ -10,8 +10,9 @@
class TestDailyReportKey:
@pytest.fixture
def mock_save(self):
- with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
- yield mock_save
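+        # patch out the index-template check so these tests never contact elasticsearch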
+ with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'):
+ with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
+ yield mock_save
def test_default(self, mock_save):
# only one of this type of report per day
diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py
new file mode 100644
index 00000000000..5bc6e4c4bc4
--- /dev/null
+++ b/osf_tests/metrics/test_es8_metrics.py
@@ -0,0 +1,92 @@
+import datetime
+
+from elasticsearch_metrics.tests.util import djelme_test_backends
+import pytest
+
+from osf.metrics.es8_metrics import (
+ PageviewInfo,
+ DailyDownloadCountReportEs8,
+ OsfCountedUsageEvent,
+)
+
+
+class TestEs8Metrics:
+ """smoke tests to check that djelme records can be saved and searched"""
+
+ @pytest.fixture(autouse=True)
+ def _real_elastic(self):
+ with djelme_test_backends():
+ yield
+
+ def test_nested_pageview_autofill(self):
+ usage = OsfCountedUsageEvent.record(
+ timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC),
+ sessionhour_id='blah',
+ database_iri='https://osf.example/provider',
+ item_iri='https://osf.example/itemm',
+ item_osfid='itemm',
+ item_public=True,
+ item_type='https://osf.example/Preprint',
+ platform_iri='https://osf.example',
+ user_is_authenticated=False,
+ pageview_info=PageviewInfo(
+ page_url='https://example.com/path/test',
+ referer_url='https://google.com',
+ route_name='foo.bar',
+ page_title='title title',
+ ),
+ )
+ assert usage.pageview_info.page_path == '/path/test'
+ assert usage.pageview_info.referer_domain == 'google.com'
+ assert usage.pageview_info.hour_of_day == 15
+ assert usage.item_iri in usage.within_iris
+
+ def test_nested_pageview_autofill_dict(self):
+ usage = OsfCountedUsageEvent.record(
+ timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC),
+ sessionhour_id='blah',
+ database_iri='https://osf.example/provider',
+ item_iri='https://osf.example/itemm',
+ item_osfid='itemm',
+ item_public=True,
+ item_type='https://osf.example/Preprint',
+ platform_iri='https://osf.example',
+ user_is_authenticated=False,
+ pageview_info={
+ 'page_url': 'https://example.com/path/test',
+ 'referer_url': 'https://google.com',
+ 'route_name': 'foo.bar',
+ 'page_title': 'title title',
+ },
+ )
+ assert usage.pageview_info.page_path == '/path/test'
+ assert usage.pageview_info.referer_domain == 'google.com'
+ assert usage.pageview_info.hour_of_day == 15
+ assert usage.item_iri in usage.within_iris
+
+ def test_none_pageview_nested_autofill(self):
+ usage = OsfCountedUsageEvent.record(
+ timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC),
+ sessionhour_id='blah',
+ database_iri='https://osf.example/provider',
+ item_iri='https://osf.example/itemm',
+ item_osfid='itemm',
+ item_public=True,
+ item_type='https://osf.example/Preprint',
+ platform_iri='https://osf.example',
+ user_is_authenticated=False,
+ )
+ assert not usage.pageview_info
+ assert usage.item_iri in usage.within_iris
+
+ def test_save_report(self):
+ _saved = DailyDownloadCountReportEs8.record(
+ cycle_coverage='2026.1.1',
+ daily_file_downloads=17,
+ )
+ DailyDownloadCountReportEs8.refresh()
+ _response = DailyDownloadCountReportEs8.search().execute()
+ (_fetched,) = _response
+ assert _fetched.meta.id == _saved.meta.id
+ assert _fetched.cycle_coverage == '2026.1.1'
+ assert _fetched.daily_file_downloads == 17
diff --git a/osf_tests/metrics/test_metric_mixin.py b/osf_tests/metrics/test_metric_mixin.py
index 4a2c32f7e71..ec9b2d302de 100644
--- a/osf_tests/metrics/test_metric_mixin.py
+++ b/osf_tests/metrics/test_metric_mixin.py
@@ -1,6 +1,6 @@
from unittest import mock
import pytest
-from elasticsearch_metrics import metrics
+import elasticsearch_metrics.imps.elastic6 as metrics
from osf.metrics.metric_mixin import MetricMixin
from osf.models import OSFUser
diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py
index 3c841e6555c..ba981e997d6 100644
--- a/osf_tests/metrics/test_monthly_report.py
+++ b/osf_tests/metrics/test_monthly_report.py
@@ -2,7 +2,7 @@
from unittest import mock
import pytest
-from elasticsearch_metrics import metrics
+import elasticsearch_metrics.imps.elastic6 as metrics
from osf.metrics.reports import MonthlyReport, ReportInvalid, PublicItemUsageReport
from osf.metrics.utils import YearMonth
@@ -11,8 +11,9 @@
class TestMonthlyReportKey:
@pytest.fixture
def mock_save(self):
- with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
- yield mock_save
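+        # patch out the index-template check so these tests never contact elasticsearch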
+ with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'):
+ with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
+ yield mock_save
def test_default(self, mock_save):
# only one of this type of report per month
@@ -79,6 +80,7 @@ class Meta:
@pytest.mark.es_metrics
+@pytest.mark.django_db
class TestLastMonthReport:
@pytest.fixture
def osfid(self):
diff --git a/osf_tests/metrics/test_spam_count_reporter.py b/osf_tests/metrics/test_spam_count_reporter.py
index 0e7ba6956bf..448a8136f7a 100644
--- a/osf_tests/metrics/test_spam_count_reporter.py
+++ b/osf_tests/metrics/test_spam_count_reporter.py
@@ -1,6 +1,7 @@
import pytest
from datetime import datetime
from osf.metrics.reporters.private_spam_metrics import PrivateSpamMetricsReporter
+from osf.metrics.reports import PrivateSpamMetricsReport
from osf.metrics.utils import YearMonth
from osf_tests.factories import NodeLogFactory, NodeFactory
from unittest.mock import patch
@@ -30,7 +31,8 @@ def test_private_spam_metrics_reporter():
mock_akismet_get_hammed_count.return_value = 10
reporter = PrivateSpamMetricsReporter(report_yearmonth)
- report = reporter.report()
+ reports_raw = reporter.report()
+ report = next(r for r in reports_raw if isinstance(r, PrivateSpamMetricsReport))
assert report.node_oopspam_flagged == 10, f"Expected 10, got {report.node_oopspam_flagged}"
assert report.node_oopspam_hammed == 5, f"Expected 5, got {report.node_oopspam_hammed}"
diff --git a/osf_tests/metrics/test_utils.py b/osf_tests/metrics/test_utils.py
index a9d312f2331..47f16be6404 100644
--- a/osf_tests/metrics/test_utils.py
+++ b/osf_tests/metrics/test_utils.py
@@ -1,15 +1,20 @@
-from datetime import date
+import datetime
import pytest
-from osf.metrics.utils import stable_key
+from osf.metrics.utils import (
+ stable_key,
+ cycle_coverage_date,
+ cycle_coverage_yearmonth,
+ YearMonth,
+)
class TestStableKey:
@pytest.mark.parametrize('args, expected_key', [
(['foo'], '2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae'),
- ([date(1953, 7, 2)], '3943be98daa91031ee7d0e0765472ce1b4a50a21f8c6dcd31047d530a50ada93'),
- (['floo', 'blar', date(3049, 2, 2)], '853cef24d58fa8cd69b20d7dfbcdbd33f20ccda1a14f57e25e43c2533504b64f'),
+ ([datetime.date(1953, 7, 2)], '3943be98daa91031ee7d0e0765472ce1b4a50a21f8c6dcd31047d530a50ada93'),
+ (['floo', 'blar', datetime.date(3049, 2, 2)], '853cef24d58fa8cd69b20d7dfbcdbd33f20ccda1a14f57e25e43c2533504b64f'),
([1, 2, 7.3], '6ab892f8109fd23b03ab24aebc4e343ed2a058d9a72f750bf90ba051627d233e'),
])
def test_successes(self, args, expected_key):
@@ -24,3 +29,13 @@ def test_successes(self, args, expected_key):
def test_value_errors(self, args):
with pytest.raises(ValueError):
stable_key(*args)
+
+
+def test_cycle_coverage_date():
+ assert cycle_coverage_date(datetime.date(1234, 5, 6)) == '1234.5.6'
+ assert cycle_coverage_date(datetime.datetime(7654, 3, 2, 1)) == '7654.3.2'
+
+
+def test_cycle_coverage_yearmonth():
+ assert cycle_coverage_yearmonth(YearMonth(2222, 33)) == '2222.33'
+ assert cycle_coverage_yearmonth(datetime.date(1234, 5, 6)) == '1234.5'
diff --git a/poetry.lock b/poetry.lock
index bfcd4c5766f..5648455ccbe 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand.
[[package]]
name = "amqp"
@@ -1085,27 +1085,24 @@ Django = ">=2.0"
[[package]]
name = "django-elasticsearch-metrics"
-version = "2022.0.6"
+version = "2026.0.4"
description = "Django app for storing time-series metrics in Elasticsearch."
optional = false
-python-versions = "*"
+python-versions = ">=3.10,<4"
groups = ["main"]
files = []
develop = false
-[package.dependencies]
-elasticsearch6-dsl = ">=6.3.0,<7.0.0"
-
[package.extras]
-dev = ["factory-boy (==2.11.1)", "flake8 (==5.0.4)", "flake8-bugbear (==18.8.0)", "konch (>=3.0.0)", "mock", "pre-commit (==2.17.0)", "pytest", "pytest-django (==3.10.0)", "tox"]
-lint = ["flake8 (==5.0.4)", "flake8-bugbear (==18.8.0)", "pre-commit (==2.17.0)"]
-tests = ["factory-boy (==2.11.1)", "mock", "pytest", "pytest-django (==3.10.0)"]
+anydjango = ["django"]
+elastic6 = ["elasticsearch6-dsl (>=6.3.0,<7.0.0)"]
+elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"]
[package.source]
type = "git"
url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git"
-reference = "f5b9312914154e213aa01731e934c593e3434269"
-resolved_reference = "f5b9312914154e213aa01731e934c593e3434269"
+reference = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2"
+resolved_reference = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2"
[[package]]
name = "django-extensions"
@@ -1189,7 +1186,7 @@ files = [
[package.dependencies]
autopep8 = "*"
Django = ">=3.2"
-gprof2dot = ">=2017.09.19"
+gprof2dot = ">=2017.9.19"
sqlparse = "*"
[[package]]
@@ -1361,14 +1358,14 @@ stone = ">=2"
[[package]]
name = "elastic-transport"
-version = "8.13.0"
+version = "8.17.1"
description = "Transport classes and utilities shared among Python Elastic client libraries"
optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.8"
groups = ["main"]
files = [
- {file = "elastic-transport-8.13.0.tar.gz", hash = "sha256:2410ec1ff51221e8b3a01c0afa9f0d0498e1386a269283801f5c12f98e42dc45"},
- {file = "elastic_transport-8.13.0-py3-none-any.whl", hash = "sha256:aec890afdddd057762b27ff3553b0be8fa4673ec1a4fd922dfbd00325874bb3d"},
+ {file = "elastic_transport-8.17.1-py3-none-any.whl", hash = "sha256:192718f498f1d10c5e9aa8b9cf32aed405e469a7f0e9d6a8923431dbb2c59fb8"},
+ {file = "elastic_transport-8.17.1.tar.gz", hash = "sha256:5edef32ac864dca8e2f0a613ef63491ee8d6b8cfb52881fa7313ba9290cac6d2"},
]
[package.dependencies]
@@ -1376,46 +1373,7 @@ certifi = "*"
urllib3 = ">=1.26.2,<3"
[package.extras]
-develop = ["aiohttp", "furo", "httpx", "mock", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"]
-
-[[package]]
-name = "elasticsearch"
-version = "6.8.2"
-description = "Python client for Elasticsearch"
-optional = false
-python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4"
-groups = ["main"]
-files = [
- {file = "elasticsearch-6.8.2-py2.py3-none-any.whl", hash = "sha256:1aedf00b73f5d1e77cb4df70fec58f2efb664be4ce2686374239aa6c0373c65c"},
- {file = "elasticsearch-6.8.2.tar.gz", hash = "sha256:c3a560bb83e4981b5a5c82080d2ceb99686d33692ef53365656129478aa5ddb2"},
-]
-
-[package.dependencies]
-urllib3 = ">=1.21.1"
-
-[package.extras]
-develop = ["coverage", "mock", "nose", "nosexcover", "numpy", "pandas", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx (<1.7)", "sphinx-rtd-theme"]
-requests = ["requests (>=2.4.0,<3.0.0)"]
-
-[[package]]
-name = "elasticsearch-dsl"
-version = "6.4.0"
-description = "Python client for Elasticsearch"
-optional = false
-python-versions = "*"
-groups = ["main"]
-files = [
- {file = "elasticsearch-dsl-6.4.0.tar.gz", hash = "sha256:26416f4dd46ceca43d62ef74970d9de4bdd6f4b0f163316f0b432c9e61a08bec"},
- {file = "elasticsearch_dsl-6.4.0-py2.py3-none-any.whl", hash = "sha256:f60aea7fd756ac1fbe7ce114bbf4949aefbf495dfe8896640e787c67344f12f6"},
-]
-
-[package.dependencies]
-elasticsearch = ">=6.0.0,<7.0.0"
-python-dateutil = "*"
-six = "*"
-
-[package.extras]
-develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"]
+develop = ["aiohttp", "furo", "httpx", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"]
[[package]]
name = "elasticsearch2"
@@ -1471,6 +1429,32 @@ six = "*"
[package.extras]
develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"]
+[[package]]
+name = "elasticsearch8"
+version = "8.19.3"
+description = "Python client for Elasticsearch"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+ {file = "elasticsearch8-8.19.3-py3-none-any.whl", hash = "sha256:4b52e59e68aea6f59bf37c28f6f4512333302dd8a52e26c17d0f10c076d833a1"},
+ {file = "elasticsearch8-8.19.3.tar.gz", hash = "sha256:7effe95b360241b6d56ef68219037a90ad0f56723614db54bbe57d33058402f4"},
+]
+
+[package.dependencies]
+elastic-transport = ">=8.15.1,<9"
+python-dateutil = "*"
+typing-extensions = "*"
+
+[package.extras]
+async = ["aiohttp (>=3,<4)"]
+dev = ["aiohttp", "black", "build", "coverage", "isort", "jinja2", "mapbox-vector-tile", "mypy", "nox", "numpy", "orjson", "pandas", "pyarrow ; python_version < \"3.14\"", "pyright", "pytest", "pytest-asyncio", "pytest-cov", "pytest-mock", "python-dateutil", "pyyaml (>=5.4)", "requests (>=2,<3)", "simsimd", "tqdm", "twine", "types-python-dateutil", "types-tqdm", "unasync"]
+docs = ["sphinx", "sphinx-autodoc-typehints", "sphinx-rtd-theme (>=2.0)"]
+orjson = ["orjson (>=3)"]
+pyarrow = ["pyarrow (>=1)"]
+requests = ["requests (>=2.4.0,!=2.32.2,<3.0.0)"]
+vectorstore-mmr = ["numpy (>=1)", "simsimd (>=3)"]
+
[[package]]
name = "email-validator"
version = "2.1.1"
@@ -1771,12 +1755,12 @@ files = [
[package.dependencies]
google-auth = ">=2.14.1,<3.0.dev0"
googleapis-common-protos = ">=1.56.2,<2.0.dev0"
-proto-plus = ">=1.22.3,<2.0.0dev"
+proto-plus = ">=1.22.3,<2.0.0.dev0"
protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0"
requests = ">=2.18.0,<3.0.0.dev0"
[package.extras]
-grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""]
+grpc = ["grpcio (>=1.33.2,<2.0.dev0)", "grpcio (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""]
grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"]
@@ -1852,11 +1836,11 @@ files = [
]
[package.dependencies]
-google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev"
-google-auth = ">=1.25.0,<3.0dev"
+google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0.dev0"
+google-auth = ">=1.25.0,<3.0.dev0"
[package.extras]
-grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"]
+grpc = ["grpcio (>=1.38.0,<2.0.dev0)", "grpcio-status (>=1.38.0,<2.0.dev0)"]
[[package]]
name = "google-cloud-storage"
@@ -1871,15 +1855,15 @@ files = [
]
[package.dependencies]
-google-api-core = ">=2.15.0,<3.0.0dev"
-google-auth = ">=2.26.1,<3.0dev"
-google-cloud-core = ">=2.3.0,<3.0dev"
-google-crc32c = ">=1.0,<2.0dev"
+google-api-core = ">=2.15.0,<3.0.0.dev0"
+google-auth = ">=2.26.1,<3.0.dev0"
+google-cloud-core = ">=2.3.0,<3.0.dev0"
+google-crc32c = ">=1.0,<2.0.dev0"
google-resumable-media = ">=2.6.0"
-requests = ">=2.18.0,<3.0.0dev"
+requests = ">=2.18.0,<3.0.0.dev0"
[package.extras]
-protobuf = ["protobuf (<5.0.0dev)"]
+protobuf = ["protobuf (<5.0.0.dev0)"]
[[package]]
name = "google-crc32c"
@@ -1934,11 +1918,11 @@ files = [
]
[package.dependencies]
-google-crc32c = ">=1.0,<2.0dev"
+google-crc32c = ">=1.0,<2.0.dev0"
[package.extras]
-aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "google-auth (>=1.22.0,<2.0dev)"]
-requests = ["requests (>=2.18.0,<3.0.0dev)"]
+aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "google-auth (>=1.22.0,<2.0.dev0)"]
+requests = ["requests (>=2.18.0,<3.0.0.dev0)"]
[[package]]
name = "googleapis-common-protos"
@@ -2317,7 +2301,7 @@ files = [
[package.dependencies]
attrs = ">=22.2.0"
-jsonschema-specifications = ">=2023.03.6"
+jsonschema-specifications = ">=2023.3.6"
referencing = ">=0.28.4"
rpds-py = ">=0.7.1"
@@ -3074,7 +3058,7 @@ files = [
]
[package.dependencies]
-protobuf = ">=3.19.0,<6.0.0dev"
+protobuf = ">=3.19.0,<6.0.0.dev0"
[package.extras]
testing = ["google-api-core (>=1.31.5)"]
@@ -4104,10 +4088,10 @@ files = [
]
[package.dependencies]
-botocore = ">=1.33.2,<2.0a.0"
+botocore = ">=1.33.2,<2.0a0"
[package.extras]
-crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"]
+crt = ["botocore[crt] (>=1.33.2,<2.0a0)"]
[[package]]
name = "schema"
@@ -4428,6 +4412,18 @@ files = [
{file = "types_python_dateutil-2.9.0.20240906-py3-none-any.whl", hash = "sha256:27c8cc2d058ccb14946eebcaaa503088f4f6dbc4fb6093d3d456a49aef2753f6"},
]
+[[package]]
+name = "typing-extensions"
+version = "4.15.0"
+description = "Backported and Experimental Type Hints for Python 3.9+"
+optional = false
+python-versions = ">=3.9"
+groups = ["main"]
+files = [
+ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"},
+ {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"},
+]
+
[[package]]
name = "tzdata"
version = "2024.1"
@@ -4731,4 +4727,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.1"
python-versions = "^3.12"
-content-hash = "dfca5743cef25a20317ca8e7625404a7acbf9050461d7098684674ec109e41ee"
+content-hash = "e323eef9d1d9eb45133fd2815ac003b7cf3d7a25682df321165ed091ab49436a"
diff --git a/pyproject.toml b/pyproject.toml
index 4a776719fac..e47e851fc44 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,9 +31,10 @@ Markupsafe = "2.1.5"
blinker = "1.7.0"
furl = "2.1.3"
elasticsearch2 = "2.5.1"
-elasticsearch = "6.8.2" # max version to support elasticsearch6
-elasticsearch-dsl = "6.4.0" # max version to support elasticsearch6
-elastic-transport = "8.13.0"
+elasticsearch6 = "6.8.2"
+elasticsearch6-dsl = "6.4.0"
+elasticsearch8 = "8.19.3"
+elastic-transport = "8.17.1"
google-api-python-client = "2.123.0"
google-auth = "2.29.0"
Babel = "2.14.0"
@@ -90,7 +91,7 @@ datacite = "1.1.3"
rdflib = "7.0.0"
colorlog = "6.8.2"
# Metrics
-django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "f5b9312914154e213aa01731e934c593e3434269"} # branch is feature/pin-esdsl
+django-elasticsearch-metrics = {git = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2"}
# Impact Metrics CSV Export
djangorestframework-csv = "3.0.2"
gevent = "24.2.1"
diff --git a/website/settings/defaults.py b/website/settings/defaults.py
index fbe9b939ae1..ca95f9f0dab 100644
--- a/website/settings/defaults.py
+++ b/website/settings/defaults.py
@@ -107,6 +107,11 @@ def parent_dir(path):
SEARCH_ENGINE = 'elastic' # Can be 'elastic', or None
ELASTIC_URI = '127.0.0.1:9200'
ELASTIC6_URI = os.environ.get('ELASTIC6_URI', '127.0.0.1:9201')
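+# elasticsearch8 connection settings for the es8 metrics backend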
+ELASTIC8_URI = os.environ.get('ELASTIC8_URI')
+ELASTIC8_CERT_PATH = os.environ.get('ELASTIC8_CERT_PATH')
+ELASTIC8_ASSERT_HOSTNAME = os.environ.get('ELASTIC8_ASSERT_HOSTNAME')
+ELASTIC8_USERNAME = os.environ.get('ELASTIC8_USERNAME', 'elastic')
+ELASTIC8_SECRET = os.environ.get('ELASTIC8_SECRET')
ELASTIC_TIMEOUT = 10
ELASTIC_INDEX = 'website'
ELASTIC_KWARGS = {
@@ -411,6 +416,7 @@ class CeleryConfig:
task_account_status_changes_queue = 'account_status_changes'
task_external_high_queue = 'external_high'
task_external_low_queue = 'external_low'
+ task_background_migration_queue = 'background_migration'
external_high_modules = {
'osf.tasks.log_gv_addon',
@@ -476,6 +482,10 @@ class CeleryConfig:
'scripts.enhanced_stuck_registration_audit',
}
+ background_migration_modules = {
+ 'osf.management.commands.migrate_osfmetrics_6to8',
+ }
+
try:
from kombu import Queue, Exchange
except ImportError:
@@ -529,12 +539,19 @@ class CeleryConfig:
routing_key=task_external_low_queue,
consumer_arguments={'x-priority': -2},
),
+ Queue(
+ task_background_migration_queue,
+ Exchange(task_background_migration_queue),
+ routing_key=task_background_migration_queue,
+ consumer_arguments={'x-priority': -1},
+ ),
)
task_default_exchange_type = 'direct'
task_routes = ('framework.celery_tasks.routers.CeleryRouter', )
task_ignore_result = True
task_store_errors_even_if_ignored = True
+ result_extended = True
broker_url = os.environ.get('BROKER_URL', f'amqp://{RABBITMQ_USERNAME}:{RABBITMQ_PASSWORD}@{RABBITMQ_HOST}:{RABBITMQ_PORT}/{RABBITMQ_VHOST}')
broker_use_ssl = False
@@ -583,6 +600,7 @@ class CeleryConfig:
'scripts.remove_after_use.merge_notification_subscription_provider_ct',
'scripts.disable_removed_beat_tasks',
'osf.management.commands.delete_withdrawn_or_failed_registration_files',
+ 'osf.management.commands.migrate_osfmetrics_6to8',
)
# Modules that need metrics and release requirements