From 92728649e888730b472ac9b1cddccc916d6a1b80 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Fri, 20 Mar 2026 13:10:43 +0200 Subject: [PATCH 001/100] OSF to use latest djelme --- .docker-compose.env | 1 + api/base/settings/defaults.py | 16 ++++-- docker-compose.yml | 14 +++++ osf/metrics/counted_usage.py | 2 +- osf/metrics/preprint_metrics.py | 2 +- osf/metrics/registry_metrics.py | 2 +- osf/metrics/reports.py | 2 +- osf_tests/metrics/test_daily_report.py | 2 +- osf_tests/metrics/test_metric_mixin.py | 2 +- osf_tests/metrics/test_monthly_report.py | 2 +- poetry.lock | 67 ++++++++++++++++++------ pyproject.toml | 6 ++- website/settings/defaults.py | 1 + 13 files changed, 90 insertions(+), 29 deletions(-) diff --git a/.docker-compose.env b/.docker-compose.env index 9cb7a59e274..449c9747adf 100644 --- a/.docker-compose.env +++ b/.docker-compose.env @@ -7,6 +7,7 @@ INTERNAL_DOMAIN=http://192.168.168.167:5000/ API_DOMAIN=http://localhost:8000/ ELASTIC_URI=192.168.168.167:9200 ELASTIC6_URI=192.168.168.167:9201 +ELASTIC8_URI=http://192.168.168.167:9202 OSF_DB_HOST=192.168.168.167 DB_HOST=192.168.168.167 REDIS_HOST=redis://192.168.168.167:6379 diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index efddf2484b8..6fd9b2a4d70 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -316,10 +316,18 @@ HASHIDS_SALT = 'pinkhimalayan' # django-elasticsearch-metrics -ELASTICSEARCH_DSL = { - 'default': { - 'hosts': osf_settings.ELASTIC6_URI, - 'retry_on_timeout': True, +DJELME_AUTOSETUP = True +DJELME_BACKENDS = { + "osfmetrics_es6": { + "elasticsearch_metrics.imps.elastic6": { + "hosts": osf_settings.ELASTIC6_URI, + "retry_on_timeout": True, + }, + }, + "osfmetrics_es8": { + "elasticsearch_metrics.imps.elastic8": { + "hosts": osf_settings.ELASTIC8_URI, + }, }, } # Store yearly indices for time-series metrics diff --git a/docker-compose.yml b/docker-compose.yml index f00b589f7e0..f26c3617b67 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,6 +13,8 @@ volumes: external: false elasticsearch6_data_vol: external: false + elasticsearch8_data_vol: + external: false rabbitmq_vol: external: false ember_osf_web_dist_vol: @@ -76,6 +78,18 @@ services: - elasticsearch6_data_vol:/usr/share/elasticsearch/data stdin_open: true + elasticsearch8: + image: docker.elastic.co/elasticsearch/elasticsearch:8.19.11 + platform: linux/arm64 + environment: + - xpack.security.enabled=false + - discovery.type=single-node + ports: + - 9202:9200 + volumes: + - elasticsearch8_data_vol:/usr/share/elasticsearch/data + stdin_open: true + postgres: image: postgres:15.4 command: diff --git a/osf/metrics/counted_usage.py b/osf/metrics/counted_usage.py index 39b3b74129b..41ea012fda5 100644 --- a/osf/metrics/counted_usage.py +++ b/osf/metrics/counted_usage.py @@ -4,7 +4,7 @@ from urllib.parse import urlsplit from elasticsearch6_dsl import InnerDoc, analyzer, tokenizer -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from elasticsearch_metrics.signals import pre_save from django.dispatch import receiver import pytz diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py index 9d02ec191a2..c45cec4f24a 100644 --- a/osf/metrics/preprint_metrics.py +++ b/osf/metrics/preprint_metrics.py @@ -1,5 +1,5 @@ from elasticsearch.exceptions import NotFoundError -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from .metric_mixin import MetricMixin diff --git a/osf/metrics/registry_metrics.py b/osf/metrics/registry_metrics.py index 475dca28673..9c779fe8c0b 100644 --- a/osf/metrics/registry_metrics.py +++ b/osf/metrics/registry_metrics.py @@ -1,4 +1,4 @@ -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from osf.utils.workflows import RegistrationModerationTriggers, RegistrationModerationStates from .metric_mixin import MetricMixin diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index ffbcfb4c9b8..9d71ea7e8c2 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -4,7 +4,7 @@ from django.dispatch import receiver from elasticsearch6_dsl import InnerDoc -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from elasticsearch_metrics.signals import pre_save as metrics_pre_save from osf.metrics.utils import stable_key, YearMonth diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 46375184f95..9301cdb114f 100644 --- a/osf_tests/metrics/test_daily_report.py +++ b/osf_tests/metrics/test_daily_report.py @@ -2,7 +2,7 @@ from unittest import mock import pytest -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from osf.metrics.reports import DailyReport, ReportInvalid diff --git a/osf_tests/metrics/test_metric_mixin.py b/osf_tests/metrics/test_metric_mixin.py index 4a2c32f7e71..ec9b2d302de 100644 --- a/osf_tests/metrics/test_metric_mixin.py +++ b/osf_tests/metrics/test_metric_mixin.py @@ -1,6 +1,6 @@ from unittest import mock import pytest -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from osf.metrics.metric_mixin import MetricMixin from osf.models import OSFUser diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py index 3c841e6555c..cc8c4137cb2 100644 --- a/osf_tests/metrics/test_monthly_report.py +++ b/osf_tests/metrics/test_monthly_report.py @@ -2,7 +2,7 @@ from unittest import mock import pytest -from elasticsearch_metrics import metrics +import elasticsearch_metrics.imps.elastic6 as metrics from osf.metrics.reports import MonthlyReport, ReportInvalid, PublicItemUsageReport from osf.metrics.utils import YearMonth diff --git a/poetry.lock b/poetry.lock index 83ca13f7a00..062070e3f1e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1085,27 +1085,24 @@ Django = ">=2.0" [[package]] name = "django-elasticsearch-metrics" -version = "2022.0.6" +version = "2026.0.0" description = "Django app for storing time-series metrics in Elasticsearch." optional = false -python-versions = "*" +python-versions = ">=3.10,<4" groups = ["main"] files = [] develop = false -[package.dependencies] -elasticsearch6-dsl = ">=6.3.0,<7.0.0" - [package.extras] -dev = ["factory-boy (==2.11.1)", "flake8 (==5.0.4)", "flake8-bugbear (==18.8.0)", "konch (>=3.0.0)", "mock", "pre-commit (==2.17.0)", "pytest", "pytest-django (==3.10.0)", "tox"] -lint = ["flake8 (==5.0.4)", "flake8-bugbear (==18.8.0)", "pre-commit (==2.17.0)"] -tests = ["factory-boy (==2.11.1)", "mock", "pytest", "pytest-django (==3.10.0)"] +anydjango = ["django"] +elastic6 = ["elasticsearch6-dsl (>=6.3.0,<7.0.0)"] +elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "f5b9312914154e213aa01731e934c593e3434269" -resolved_reference = "f5b9312914154e213aa01731e934c593e3434269" +reference = "36fc0bbe001ee792f4ab5723eaeec295b8a20eb2" +resolved_reference = "36fc0bbe001ee792f4ab5723eaeec295b8a20eb2" [[package]] name = "django-extensions" @@ -1361,14 +1358,14 @@ stone = ">=2" [[package]] name = "elastic-transport" -version = "8.13.0" +version = "8.17.1" description = "Transport classes and utilities shared among Python Elastic client libraries" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" groups = ["main"] files = [ - {file = "elastic-transport-8.13.0.tar.gz", hash = "sha256:2410ec1ff51221e8b3a01c0afa9f0d0498e1386a269283801f5c12f98e42dc45"}, - {file = "elastic_transport-8.13.0-py3-none-any.whl", hash = "sha256:aec890afdddd057762b27ff3553b0be8fa4673ec1a4fd922dfbd00325874bb3d"}, + {file = "elastic_transport-8.17.1-py3-none-any.whl", hash = "sha256:192718f498f1d10c5e9aa8b9cf32aed405e469a7f0e9d6a8923431dbb2c59fb8"}, + {file = "elastic_transport-8.17.1.tar.gz", hash = "sha256:5edef32ac864dca8e2f0a613ef63491ee8d6b8cfb52881fa7313ba9290cac6d2"}, ] [package.dependencies] @@ -1376,7 +1373,7 @@ certifi = "*" urllib3 = ">=1.26.2,<3" [package.extras] -develop = ["aiohttp", "furo", "httpx", "mock", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"] +develop = ["aiohttp", "furo", "httpx", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"] [[package]] name = "elasticsearch" @@ -1471,6 +1468,32 @@ six = "*" [package.extras] develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"] +[[package]] +name = "elasticsearch8" +version = "8.19.3" +description = "Python client for Elasticsearch" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "elasticsearch8-8.19.3-py3-none-any.whl", hash = "sha256:4b52e59e68aea6f59bf37c28f6f4512333302dd8a52e26c17d0f10c076d833a1"}, + {file = "elasticsearch8-8.19.3.tar.gz", hash = "sha256:7effe95b360241b6d56ef68219037a90ad0f56723614db54bbe57d33058402f4"}, +] + +[package.dependencies] +elastic-transport = ">=8.15.1,<9" +python-dateutil = "*" +typing-extensions = "*" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +dev = ["aiohttp", "black", "build", "coverage", "isort", "jinja2", "mapbox-vector-tile", "mypy", "nox", "numpy", "orjson", "pandas", "pyarrow ; python_version < \"3.14\"", "pyright", "pytest", "pytest-asyncio", "pytest-cov", "pytest-mock", "python-dateutil", "pyyaml (>=5.4)", "requests (>=2,<3)", "simsimd", "tqdm", "twine", "types-python-dateutil", "types-tqdm", "unasync"] +docs = ["sphinx", "sphinx-autodoc-typehints", "sphinx-rtd-theme (>=2.0)"] +orjson = ["orjson (>=3)"] +pyarrow = ["pyarrow (>=1)"] +requests = ["requests (>=2.4.0,!=2.32.2,<3.0.0)"] +vectorstore-mmr = ["numpy (>=1)", "simsimd (>=3)"] + [[package]] name = "email-validator" version = "2.1.1" @@ -4412,6 +4435,18 @@ files = [ {file = "types_python_dateutil-2.9.0.20240906-py3-none-any.whl", hash = "sha256:27c8cc2d058ccb14946eebcaaa503088f4f6dbc4fb6093d3d456a49aef2753f6"}, ] +[[package]] +name = "typing-extensions" +version = "4.15.0" +description = "Backported and Experimental Type Hints for Python 3.9+" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, + {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, +] + [[package]] name = "tzdata" version = "2024.1" @@ -4715,4 +4750,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "2bc7e95f03d05e8b3335514e887b590acdab5cb2a44fc47bde870bdf8e465bf2" +content-hash = "2175d011bdf45da06721a3cf70377730725e59a52f6ac4357152c38e4427d1e3" diff --git a/pyproject.toml b/pyproject.toml index b1646584209..700b836895b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,9 @@ furl = "2.1.3" elasticsearch2 = "2.5.1" elasticsearch = "6.8.2" # max version to support elasticsearch6 elasticsearch-dsl = "6.4.0" # max version to support elasticsearch6 -elastic-transport = "8.13.0" +elasticsearch6-dsl = "6.4.0" +elasticsearch8 = "8.19.3" +elastic-transport = "8.17.1" google-api-python-client = "2.123.0" google-auth = "2.29.0" Babel = "2.14.0" @@ -90,7 +92,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "f5b9312914154e213aa01731e934c593e3434269"} # branch is feature/pin-esdsl +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "36fc0bbe001ee792f4ab5723eaeec295b8a20eb2"} # branch is feature/pin-esdsl # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 2d3dcecba3b..d0ae58dc863 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -113,6 +113,7 @@ def parent_dir(path): SEARCH_ENGINE = 'elastic' # Can be 'elastic', or None ELASTIC_URI = '127.0.0.1:9200' ELASTIC6_URI = os.environ.get('ELASTIC6_URI', '127.0.0.1:9201') +ELASTIC8_URI = os.environ.get('ELASTIC8_URI', '127.0.0.1:9202') ELASTIC_TIMEOUT = 10 ELASTIC_INDEX = 'website' ELASTIC_KWARGS = { From 590f7a2c405d1c30dc012d0d436f843e8c1afa19 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Mon, 23 Mar 2026 15:04:59 +0200 Subject: [PATCH 002/100] fix test fails --- api/base/settings/defaults.py | 14 +++++----- conftest.py | 50 ++++++++--------------------------- 2 files changed, 18 insertions(+), 46 deletions(-) diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 6fd9b2a4d70..816586ffcfb 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -318,15 +318,15 @@ # django-elasticsearch-metrics DJELME_AUTOSETUP = True DJELME_BACKENDS = { - "osfmetrics_es6": { - "elasticsearch_metrics.imps.elastic6": { - "hosts": osf_settings.ELASTIC6_URI, - "retry_on_timeout": True, + 'osfmetrics_es6': { + 'elasticsearch_metrics.imps.elastic6': { + 'hosts': osf_settings.ELASTIC6_URI, + 'retry_on_timeout': True, }, }, - "osfmetrics_es8": { - "elasticsearch_metrics.imps.elastic8": { - "hosts": osf_settings.ELASTIC8_URI, + 'osfmetrics_es8': { + 'elasticsearch_metrics.imps.elastic8': { + 'hosts': osf_settings.ELASTIC8_URI, }, }, } diff --git a/conftest.py b/conftest.py index 9494e3d296e..59b8def87c9 100644 --- a/conftest.py +++ b/conftest.py @@ -1,14 +1,11 @@ -import contextlib from unittest import mock import logging import os import re -from django.core.management import call_command from django.db import transaction -from elasticsearch import exceptions as es_exceptions from elasticsearch_dsl.connections import connections -from elasticsearch_metrics.registry import registry as es_metrics_registry +from elasticsearch_metrics.tests._test_util import RealElasticTestCase from faker import Factory import pytest import responses @@ -138,47 +135,22 @@ def es6_client(setup_connections): @pytest.fixture(scope='function', autouse=True) -def _es_metrics_marker(request, worker_id): +def _es_metrics_marker(request): """Clear out all indices and index templates before and after tests marked with `es_metrics`. """ marker = request.node.get_closest_marker('es_metrics') - if marker: - es6_client = request.getfixturevalue('es6_client') - _temp_prefix = 'temp_metrics_' - _temp_wildcard = f'{_temp_prefix}-{worker_id}*' - - def _teardown_es_temps(): - es6_client.indices.delete(index=_temp_wildcard) - try: - es6_client.indices.delete_template(_temp_wildcard) - except es_exceptions.NotFoundError: - pass - - @contextlib.contextmanager - def _mock_metric_names(): - with contextlib.ExitStack() as _exit: - for _metric_class in es_metrics_registry.get_metrics(): - _exit.enter_context(mock.patch.object( - _metric_class, - '_template_name', # also used to construct index names - f'{_temp_prefix}-{worker_id}{_metric_class._template_name}', - )) - _exit.enter_context(mock.patch.object( - _metric_class, - '_template', # a wildcard string for indexes and templates - f'{_temp_prefix}-{worker_id}{_metric_class._template}', - )) - yield - - _teardown_es_temps() - with _mock_metric_names(): - call_command('sync_metrics') - yield - _teardown_es_temps() - else: + + if not marker: yield + return + + es6_test_case = RealElasticTestCase() + es6_test_case.setup_backends() + + yield + es6_test_case.teardown_backends() @pytest.fixture def mock_share_responses(): From 6ebdd8eb8d195f5727ca07ef1bf8ebc95ed8b045 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Mon, 23 Mar 2026 15:26:33 +0200 Subject: [PATCH 003/100] fix poetry issue --- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index 062070e3f1e..fe2f9cc9721 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1085,7 +1085,7 @@ Django = ">=2.0" [[package]] name = "django-elasticsearch-metrics" -version = "2026.0.0" +version = "2026.0.3" description = "Django app for storing time-series metrics in Elasticsearch." optional = false python-versions = ">=3.10,<4" @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "36fc0bbe001ee792f4ab5723eaeec295b8a20eb2" -resolved_reference = "36fc0bbe001ee792f4ab5723eaeec295b8a20eb2" +reference = "7a7f664469070dd52dc4d9401f6b6d2d9fe7ddf0" +resolved_reference = "7a7f664469070dd52dc4d9401f6b6d2d9fe7ddf0" [[package]] name = "django-extensions" @@ -4750,4 +4750,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "2175d011bdf45da06721a3cf70377730725e59a52f6ac4357152c38e4427d1e3" +content-hash = "68eafe36dddc2e9380dc164244e54830ed593d23af348df72f776c40b102d99c" diff --git a/pyproject.toml b/pyproject.toml index 700b836895b..48c5e8dd0a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "36fc0bbe001ee792f4ab5723eaeec295b8a20eb2"} # branch is feature/pin-esdsl +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "7a7f664469070dd52dc4d9401f6b6d2d9fe7ddf0"} # branch is feature/pin-esdsl # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From fbe2a0813b86f96687c0434ed2693e9299ac6676 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Mon, 23 Mar 2026 16:50:46 +0200 Subject: [PATCH 004/100] add connection --- conftest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/conftest.py b/conftest.py index 59b8def87c9..a0edc40b20a 100644 --- a/conftest.py +++ b/conftest.py @@ -5,6 +5,7 @@ from django.db import transaction from elasticsearch_dsl.connections import connections +from website import settings as osf_settings from elasticsearch_metrics.tests._test_util import RealElasticTestCase from faker import Factory import pytest @@ -145,6 +146,11 @@ def _es_metrics_marker(request): yield return + connections.create_connection( + alias='osfmetrics_es6', + hosts=osf_settings.ELASTIC6_URI, + ) + es6_test_case = RealElasticTestCase() es6_test_case.setup_backends() From ea78a15f04217d14476a19ad7ba46bbab04e2d44 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 24 Mar 2026 16:30:34 +0200 Subject: [PATCH 005/100] remove connection, add proper setUp and tearDown --- conftest.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/conftest.py b/conftest.py index a0edc40b20a..d08c7b1fdff 100644 --- a/conftest.py +++ b/conftest.py @@ -146,17 +146,13 @@ def _es_metrics_marker(request): yield return - connections.create_connection( - alias='osfmetrics_es6', - hosts=osf_settings.ELASTIC6_URI, - ) - - es6_test_case = RealElasticTestCase() - es6_test_case.setup_backends() - - yield - - es6_test_case.teardown_backends() + class _Es6TestCase(RealElasticTestCase, autosetup_djelme_backends=True): ... + es6_test_case = _Es6TestCase() + es6_test_case.setUp() + try: + yield + finally: + es6_test_case.tearDown() @pytest.fixture def mock_share_responses(): From 0efd0b13806ec84da8b77af5c294e7f5af966883 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 25 Mar 2026 13:52:16 +0200 Subject: [PATCH 006/100] remove elasticsearch and elasticsearch-dsl --- addons/base/views.py | 2 +- api/metrics/views.py | 2 +- api_tests/metrics/test_preprint_metrics.py | 2 +- conftest.py | 5 +++ .../commands/monthly_reporters_go.py | 2 +- osf/metrics/metric_mixin.py | 2 +- osf/metrics/preprint_metrics.py | 2 +- poetry.lock | 41 +------------------ pyproject.toml | 3 +- 9 files changed, 13 insertions(+), 48 deletions(-) diff --git a/addons/base/views.py b/addons/base/views.py index 5ff3d6e7093..ebcd662966b 100644 --- a/addons/base/views.py +++ b/addons/base/views.py @@ -14,7 +14,7 @@ import waffle from django.db import transaction from django.contrib.contenttypes.models import ContentType -from elasticsearch import exceptions as es_exceptions +from elasticsearch6 import exceptions as es_exceptions from rest_framework import status as http_status from api.caching.tasks import update_storage_usage_with_size diff --git a/api/metrics/views.py b/api/metrics/views.py index daaa684d13a..976837ec220 100644 --- a/api/metrics/views.py +++ b/api/metrics/views.py @@ -6,7 +6,7 @@ from django.http import JsonResponse, HttpResponse, Http404 from django.utils import timezone -from elasticsearch.exceptions import NotFoundError, RequestError +from elasticsearch6.exceptions import NotFoundError, RequestError from elasticsearch_dsl.connections import get_connection from framework.auth.oauth_scopes import CoreScopes diff --git a/api_tests/metrics/test_preprint_metrics.py b/api_tests/metrics/test_preprint_metrics.py index 1bde8719b75..cd9b8041c2d 100644 --- a/api_tests/metrics/test_preprint_metrics.py +++ b/api_tests/metrics/test_preprint_metrics.py @@ -8,7 +8,7 @@ from django.utils import timezone from waffle.testutils import override_switch -from elasticsearch.exceptions import RequestError +from elasticsearch6.exceptions import RequestError from osf import features from api.base.settings import API_PRIVATE_BASE as API_BASE diff --git a/conftest.py b/conftest.py index d08c7b1fdff..6a6be40a7d9 100644 --- a/conftest.py +++ b/conftest.py @@ -146,6 +146,11 @@ def _es_metrics_marker(request): yield return + connections.create_connection( + alias='osfmetrics_es6', + hosts=osf_settings.ELASTIC6_URI, + ) + class _Es6TestCase(RealElasticTestCase, autosetup_djelme_backends=True): ... es6_test_case = _Es6TestCase() es6_test_case.setUp() diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index 83ed5f6d985..218b45da1df 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -3,7 +3,7 @@ from django.core.management.base import BaseCommand from django.db import OperationalError as DjangoOperationalError -from elasticsearch.exceptions import ConnectionError as ElasticConnectionError +from elasticsearch6.exceptions import ConnectionError as ElasticConnectionError from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app diff --git a/osf/metrics/metric_mixin.py b/osf/metrics/metric_mixin.py index 724ab1958da..df87d5123b1 100644 --- a/osf/metrics/metric_mixin.py +++ b/osf/metrics/metric_mixin.py @@ -2,7 +2,7 @@ from django.db import models from django.utils import timezone -from elasticsearch.exceptions import NotFoundError +from elasticsearch6.exceptions import NotFoundError import pytz diff --git a/osf/metrics/preprint_metrics.py b/osf/metrics/preprint_metrics.py index c45cec4f24a..d284d80827e 100644 --- a/osf/metrics/preprint_metrics.py +++ b/osf/metrics/preprint_metrics.py @@ -1,4 +1,4 @@ -from elasticsearch.exceptions import NotFoundError +from elasticsearch6.exceptions import NotFoundError import elasticsearch_metrics.imps.elastic6 as metrics from .metric_mixin import MetricMixin diff --git a/poetry.lock b/poetry.lock index fe2f9cc9721..530b6252e18 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1375,45 +1375,6 @@ urllib3 = ">=1.26.2,<3" [package.extras] develop = ["aiohttp", "furo", "httpx", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"] -[[package]] -name = "elasticsearch" -version = "6.8.2" -description = "Python client for Elasticsearch" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, <4" -groups = ["main"] -files = [ - {file = "elasticsearch-6.8.2-py2.py3-none-any.whl", hash = "sha256:1aedf00b73f5d1e77cb4df70fec58f2efb664be4ce2686374239aa6c0373c65c"}, - {file = "elasticsearch-6.8.2.tar.gz", hash = "sha256:c3a560bb83e4981b5a5c82080d2ceb99686d33692ef53365656129478aa5ddb2"}, -] - -[package.dependencies] -urllib3 = ">=1.21.1" - -[package.extras] -develop = ["coverage", "mock", "nose", "nosexcover", "numpy", "pandas", "pyyaml", "requests (>=2.0.0,<3.0.0)", "sphinx (<1.7)", "sphinx-rtd-theme"] -requests = ["requests (>=2.4.0,<3.0.0)"] - -[[package]] -name = "elasticsearch-dsl" -version = "6.4.0" -description = "Python client for Elasticsearch" -optional = false -python-versions = "*" -groups = ["main"] -files = [ - {file = "elasticsearch-dsl-6.4.0.tar.gz", hash = "sha256:26416f4dd46ceca43d62ef74970d9de4bdd6f4b0f163316f0b432c9e61a08bec"}, - {file = "elasticsearch_dsl-6.4.0-py2.py3-none-any.whl", hash = "sha256:f60aea7fd756ac1fbe7ce114bbf4949aefbf495dfe8896640e787c67344f12f6"}, -] - -[package.dependencies] -elasticsearch = ">=6.0.0,<7.0.0" -python-dateutil = "*" -six = "*" - -[package.extras] -develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytz", "sphinx", "sphinx-rtd-theme"] - [[package]] name = "elasticsearch2" version = "2.5.1" @@ -4750,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "68eafe36dddc2e9380dc164244e54830ed593d23af348df72f776c40b102d99c" +content-hash = "755718b3333d5fe7983ac875532429e80eb2d45684002ae9212bc96a7800d014" diff --git a/pyproject.toml b/pyproject.toml index 48c5e8dd0a5..f40ad4fb107 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,8 +31,7 @@ Markupsafe = "2.1.5" blinker = "1.7.0" furl = "2.1.3" elasticsearch2 = "2.5.1" -elasticsearch = "6.8.2" # max version to support elasticsearch6 -elasticsearch-dsl = "6.4.0" # max version to support elasticsearch6 +elasticsearch6= "6.8.2" elasticsearch6-dsl = "6.4.0" elasticsearch8 = "8.19.3" elastic-transport = "8.17.1" From 684a83f1937da1cd1eb2c21f389199ada165102a Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 25 Mar 2026 14:04:48 +0200 Subject: [PATCH 007/100] remove elasticsearch-dsl --- api/base/elasticsearch_dsl_views.py | 8 ++++---- api/metrics/views.py | 2 +- conftest.py | 2 +- osf/management/commands/reindex_es6.py | 2 +- osf/metrics/reporters/public_item_usage.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/api/base/elasticsearch_dsl_views.py b/api/base/elasticsearch_dsl_views.py index 6199fd82d0e..ecf2825d4e8 100644 --- a/api/base/elasticsearch_dsl_views.py +++ b/api/base/elasticsearch_dsl_views.py @@ -3,7 +3,7 @@ import datetime import typing -import elasticsearch_dsl as edsl +import elasticsearch6_dsl as edsl from rest_framework import generics, exceptions as drf_exceptions from rest_framework.settings import api_settings as drf_settings from api.base.settings.defaults import REPORT_FILENAME_FORMAT @@ -23,7 +23,7 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, abc.ABC): - '''abstract view class using `elasticsearch_dsl.Search` as a queryset-analogue + '''abstract view class using `elasticsearch6_dsl.Search` as a queryset-analogue builds a `Search` based on `self.get_default_search()` and the request's query parameters for filtering, sorting, and pagination -- fetches only @@ -36,7 +36,7 @@ class ElasticsearchListView(FilterMixin, JSONAPIBaseView, generics.ListAPIView, @abc.abstractmethod def get_default_search(self) -> edsl.Search | None: - '''the base `elasticsearch_dsl.Search` for this list, based on url path + '''the base `elasticsearch6_dsl.Search` for this list, based on url path (common jsonapi query parameters will be considered automatically) ''' @@ -95,7 +95,7 @@ def finalize_response(self, request, response, *args, **kwargs): # (filtering handled in-view to reuse logic from FilterMixin) filter_backends = () - # note: because elasticsearch_dsl.Search supports slicing and gives results when iterated on, + # note: because elasticsearch6_dsl.Search supports slicing and gives results when iterated on, # it works fine with default pagination # override rest_framework.generics.GenericAPIView diff --git a/api/metrics/views.py b/api/metrics/views.py index 976837ec220..c6e4d56c9b9 100644 --- a/api/metrics/views.py +++ b/api/metrics/views.py @@ -7,7 +7,7 @@ from django.utils import timezone from elasticsearch6.exceptions import NotFoundError, RequestError -from elasticsearch_dsl.connections import get_connection +from elasticsearch6_dsl.connections import get_connection from framework.auth.oauth_scopes import CoreScopes diff --git a/conftest.py b/conftest.py index 6a6be40a7d9..7665782cb34 100644 --- a/conftest.py +++ b/conftest.py @@ -4,7 +4,7 @@ import re from django.db import transaction -from elasticsearch_dsl.connections import connections +from elasticsearch6_dsl.connections import connections from website import settings as osf_settings from elasticsearch_metrics.tests._test_util import RealElasticTestCase from faker import Factory diff --git a/osf/management/commands/reindex_es6.py b/osf/management/commands/reindex_es6.py index c37d0e34f2c..8961ea6fff1 100644 --- a/osf/management/commands/reindex_es6.py +++ b/osf/management/commands/reindex_es6.py @@ -4,7 +4,7 @@ import logging from django.core.management.base import BaseCommand -from elasticsearch_dsl import connections +from elasticsearch6_dsl import connections from elasticsearch_metrics.registry import registry logger = logging.getLogger(__name__) diff --git a/osf/metrics/reporters/public_item_usage.py b/osf/metrics/reporters/public_item_usage.py index cc401d50bd7..7df405d385f 100644 --- a/osf/metrics/reporters/public_item_usage.py +++ b/osf/metrics/reporters/public_item_usage.py @@ -4,7 +4,7 @@ import waffle if typing.TYPE_CHECKING: - import elasticsearch_dsl as edsl + import elasticsearch6_dsl as edsl import osf.features from osf.metadata.osf_gathering import OsfmapPartition From 11516100b737a81e9c9aeb88819c786dfcffb214 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 25 Mar 2026 16:15:24 +0200 Subject: [PATCH 008/100] remove sleep() and refresh indices --- .../views/test_institution_department_list.py | 11 ++++++----- .../views/test_institution_summary_metrics.py | 7 ++++++- .../views/test_institution_user_metric_list.py | 10 +++++++++- api_tests/metrics/test_composite_query.py | 2 +- .../metrics/test_registries_moderation_metrics.py | 4 ++-- conftest.py | 3 ++- 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/api_tests/institutions/views/test_institution_department_list.py b/api_tests/institutions/views/test_institution_department_list.py index c2a5c0fcf99..8b785504756 100644 --- a/api_tests/institutions/views/test_institution_department_list.py +++ b/api_tests/institutions/views/test_institution_department_list.py @@ -44,7 +44,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): department_name='Old Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() _this_month = YearMonth.from_date(datetime.date.today()) @@ -56,7 +56,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): department_name='New Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() # A second user entered the department InstitutionalUserReport( @@ -66,7 +66,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): department_name='New Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() # A new department with a single user to test sorting InstitutionalUserReport( @@ -76,7 +76,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): department_name='Smaller Department', public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() # A user with no department InstitutionalUserReport( @@ -85,7 +85,7 @@ def populate_counts(self, user, user2, user3, user4, admin, institution): institution_id=institution._id, public_project_count=1, private_project_count=1, - ).save(refresh=True) + ).save() @pytest.fixture() def admin(self, institution): @@ -113,6 +113,7 @@ def test_auth(self, app, url, user, admin): assert resp.json['data'] == [] def test_get(self, app, url, admin, institution, populate_counts): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) resp = app.get(url, auth=admin.auth) assert resp.json['data'] == [{ diff --git a/api_tests/institutions/views/test_institution_summary_metrics.py b/api_tests/institutions/views/test_institution_summary_metrics.py index 41983458d2e..6dd6c5bbda3 100644 --- a/api_tests/institutions/views/test_institution_summary_metrics.py +++ b/api_tests/institutions/views/test_institution_summary_metrics.py @@ -84,6 +84,7 @@ def test_get_empty(self, app, url, institutional_admin): assert resp.json['meta'] == {'version': '2.0'} def test_get_report(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -149,6 +150,7 @@ def test_get_report_with_multiple_months_and_institutions( monthly_logged_in_user_count=270, monthly_active_user_count=260, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -189,6 +191,7 @@ def test_get_with_valid_report_dates(self, app, url, institution, institutional_ institution, user_count=4133, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(f'{url}?report_yearmonth=2024-08', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -213,6 +216,7 @@ def test_get_with_invalid_report_date(self, app, url, institution, institutional institution, user_count=999, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) # Request with an invalid report_date format resp = app.get(f'{url}?report_yearmonth=invalid-date', auth=institutional_admin.auth) @@ -233,6 +237,7 @@ def test_get_without_report_date_uses_most_recent(self, app, url, institution, i institution, user_count=999, ) + InstitutionMonthlySummaryReport._get_connection().indices.refresh(InstitutionMonthlySummaryReport._template_pattern) resp = app.get(url, auth=institutional_admin.auth) assert resp.status_code == 200 @@ -247,5 +252,5 @@ def _summary_report_factory(yearmonth, institution, **kwargs): institution_id=institution._id, **kwargs, ) - report.save(refresh=True) + report.save() return report diff --git a/api_tests/institutions/views/test_institution_user_metric_list.py b/api_tests/institutions/views/test_institution_user_metric_list.py index 0826dcd0161..d2b99da435f 100644 --- a/api_tests/institutions/views/test_institution_user_metric_list.py +++ b/api_tests/institutions/views/test_institution_user_metric_list.py @@ -89,6 +89,7 @@ def test_get_empty(self, app, url, institutional_admin): assert _resp.json['data'] == [] def test_get_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) _resp = app.get(url, auth=institutional_admin.auth) assert _resp.status_code == 200 assert len(_resp.json['data']) == len(reports) @@ -100,6 +101,7 @@ def test_get_reports(self, app, url, institutional_admin, institution, reports, assert len(response_object['attributes']['contacts']) == 0 def test_filter_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) for _query, _expected_user_ids in ( ({'filter[department]': 'nunavum'}, set()), ({'filter[department]': 'incidentally'}, set()), @@ -135,6 +137,7 @@ def test_filter_reports(self, app, url, institutional_admin, institution, report assert set(_user_ids(_resp)) == _expected_user_ids def test_sort_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count'}, ['u_sparse', 'u_orc', 'u_blargl', 'u_orcomma']), ({'sort': '-storage_byte_count'}, ['u_orcomma', 'u_blargl', 'u_orc', 'u_sparse']), @@ -144,6 +147,7 @@ def test_sort_reports(self, app, url, institutional_admin, institution, reports, assert list(_user_ids(_resp)) == _expected_user_id_list def test_paginate_reports(self, app, url, institutional_admin, institution, reports, unshown_reports): + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) for _query, _expected_user_id_list in ( ({'sort': 'storage_byte_count', 'page[size]': 2}, ['u_sparse', 'u_orc']), ({'sort': 'storage_byte_count', 'page[size]': 2, 'page': 2}, ['u_blargl', 'u_orcomma']), @@ -178,6 +182,7 @@ def test_get_report_formats_csv_tsv(self, app, url, institutional_admin, institu month_last_active='2018-02', month_last_login='2018-02', ) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -281,6 +286,7 @@ def test_csv_tsv_ignores_pagination(self, app, url, institutional_admin, institu str(736662999298 + i), f'Jalen Hurts #{i}', ]) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) # Make request for CSV format with page[size]=10 resp = app.get(f'{url}?format={format_type}', auth=institutional_admin.auth) @@ -346,6 +352,7 @@ def test_get_report_format_table_json(self, app, url, institutional_admin, insti month_last_active='2018-02', month_last_login='2018-02', ) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) resp = app.get(f'{url}?format=json_report', auth=institutional_admin.auth) assert resp.status_code == 200 @@ -411,6 +418,7 @@ def test_correct_number_of_contact_messages(self, app, url, institutional_admin, department_name='a department, or so, that happens, incidentally, to have commas', storage_byte_count=736662999298, ) + InstitutionalUserReport._get_connection().indices.refresh(InstitutionalUserReport._template_pattern) receiver = user1 with capture_notifications(): @@ -477,5 +485,5 @@ def _report_factory(yearmonth, institution, **kwargs): institution_id=institution._id, **kwargs, ) - _report.save(refresh=True) + _report.save() return _report diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py index 0cd0b3bb180..a983ebd74fb 100644 --- a/api_tests/metrics/test_composite_query.py +++ b/api_tests/metrics/test_composite_query.py @@ -75,7 +75,7 @@ def test_elasticsearch_agg_query(self, app, user, base_url, preprint): path=preprint.primary_file.path, timestamp=datetime(year=2020, month=2, day=1) ) - time.sleep(1) # gives ES some time to update + PreprintDownload._get_connection().indices.refresh(PreprintDownload._template_pattern) resp = app.post_json_api(post_url, payload, auth=user.auth) assert resp.status_code == 200 diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index 93469b1b3b5..7f59a0a417e 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -32,7 +32,7 @@ def test_record_transitions(self, registration): registration.creator, 'Metrics is easy' ) - time.sleep(1) + RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) assert RegistriesModerationMetrics.search().count() == 1 data = RegistriesModerationMetrics.search().execute()['hits']['hits'][0]['_source'] @@ -81,7 +81,7 @@ def test_registries_moderation_view(self, app, user, base_url, registration): registration.creator, 'Metrics is easy' ) - time.sleep(1) + RegistriesModerationMetrics._get_connection().indices.refresh(RegistriesModerationMetrics._template_pattern) res = app.get(base_url, auth=user.auth, expect_errors=True) data = res.json diff --git a/conftest.py b/conftest.py index 7665782cb34..232b788c0fb 100644 --- a/conftest.py +++ b/conftest.py @@ -151,7 +151,8 @@ def _es_metrics_marker(request): hosts=osf_settings.ELASTIC6_URI, ) - class _Es6TestCase(RealElasticTestCase, autosetup_djelme_backends=True): ... + class _Es6TestCase(RealElasticTestCase, autosetup_djelme_backends=True): + ... es6_test_case = _Es6TestCase() es6_test_case.setUp() try: From 4649800a5c85efb7d94daf99c82a001c9348b360 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 25 Mar 2026 17:17:15 +0200 Subject: [PATCH 009/100] remove unused imports, comment out --- api_tests/metrics/test_composite_query.py | 1 - api_tests/metrics/test_registries_moderation_metrics.py | 1 - osf/metrics/metric_mixin.py | 6 +++--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/api_tests/metrics/test_composite_query.py b/api_tests/metrics/test_composite_query.py index a983ebd74fb..016677c3a11 100644 --- a/api_tests/metrics/test_composite_query.py +++ b/api_tests/metrics/test_composite_query.py @@ -1,4 +1,3 @@ -import time import pytest from datetime import datetime from osf_tests.factories import ( diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index 7f59a0a417e..0f3dddb79b6 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -1,7 +1,6 @@ import pytest from waffle.testutils import override_switch -import time from osf import features from osf_tests.factories import RegistrationFactory, AuthUserFactory from osf.utils.workflows import RegistrationModerationStates, RegistrationModerationTriggers diff --git a/osf/metrics/metric_mixin.py b/osf/metrics/metric_mixin.py index df87d5123b1..68fa255b073 100644 --- a/osf/metrics/metric_mixin.py +++ b/osf/metrics/metric_mixin.py @@ -77,9 +77,9 @@ def _get_id_to_count(cls, size, metric_field, count_field, after=None, before=No # indices, determined from `after` @classmethod def search(cls, using=None, index=None, after=None, before=None, *args, **kwargs): - if not index and (before or after): - indices = cls._get_relevant_indices(after, before) - index = ','.join(indices) + # if not index and (before or after): + # indices = cls._get_relevant_indices(after, before) + # index = ','.join(indices) return super().search(using=using, index=index, *args, **kwargs) @classmethod From 5ea0ed1bdbfac74fc910e7b095d049c712dc75ae Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 25 Mar 2026 11:47:09 -0400 Subject: [PATCH 010/100] chore: bump djelme dependency --- osf/metrics/metric_mixin.py | 6 +-- .../management_commands/test_reindex_es6.py | 2 +- poetry.lock | 46 +++++++++---------- pyproject.toml | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/osf/metrics/metric_mixin.py b/osf/metrics/metric_mixin.py index 68fa255b073..df87d5123b1 100644 --- a/osf/metrics/metric_mixin.py +++ b/osf/metrics/metric_mixin.py @@ -77,9 +77,9 @@ def _get_id_to_count(cls, size, metric_field, count_field, after=None, before=No # indices, determined from `after` @classmethod def search(cls, using=None, index=None, after=None, before=None, *args, **kwargs): - # if not index and (before or after): - # indices = cls._get_relevant_indices(after, before) - # index = ','.join(indices) + if not index and (before or after): + indices = cls._get_relevant_indices(after, before) + index = ','.join(indices) return super().search(using=using, index=index, *args, **kwargs) @classmethod diff --git a/osf_tests/management_commands/test_reindex_es6.py b/osf_tests/management_commands/test_reindex_es6.py index 5e01be656a8..36158c18da6 100644 --- a/osf_tests/management_commands/test_reindex_es6.py +++ b/osf_tests/management_commands/test_reindex_es6.py @@ -10,7 +10,7 @@ AuthUserFactory ) -from elasticsearch_metrics.field import Keyword +from elasticsearch6_dsl import Keyword from tests.json_api_test_app import JSONAPITestApp diff --git a/poetry.lock b/poetry.lock index 530b6252e18..f0dca07d95c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.2 and should not be changed by hand. [[package]] name = "amqp" @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "7a7f664469070dd52dc4d9401f6b6d2d9fe7ddf0" -resolved_reference = "7a7f664469070dd52dc4d9401f6b6d2d9fe7ddf0" +reference = "bb1c84c148ac1d2b1079b2b113e52a01a861c8a6" +resolved_reference = "bb1c84c148ac1d2b1079b2b113e52a01a861c8a6" [[package]] name = "django-extensions" @@ -1186,7 +1186,7 @@ files = [ [package.dependencies] autopep8 = "*" Django = ">=3.2" -gprof2dot = ">=2017.09.19" +gprof2dot = ">=2017.9.19" sqlparse = "*" [[package]] @@ -1739,12 +1739,12 @@ files = [ [package.dependencies] google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = ">=1.56.2,<2.0.dev0" -proto-plus = ">=1.22.3,<2.0.0dev" +proto-plus = ">=1.22.3,<2.0.0.dev0" protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0.dev0" requests = ">=2.18.0,<3.0.0.dev0" [package.extras] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""] +grpc = ["grpcio (>=1.33.2,<2.0.dev0)", "grpcio (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""] grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] @@ -1820,11 +1820,11 @@ files = [ ] [package.dependencies] -google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" -google-auth = ">=1.25.0,<3.0dev" +google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0.dev0" +google-auth = ">=1.25.0,<3.0.dev0" [package.extras] -grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] +grpc = ["grpcio (>=1.38.0,<2.0.dev0)", "grpcio-status (>=1.38.0,<2.0.dev0)"] [[package]] name = "google-cloud-storage" @@ -1839,15 +1839,15 @@ files = [ ] [package.dependencies] -google-api-core = ">=2.15.0,<3.0.0dev" -google-auth = ">=2.26.1,<3.0dev" -google-cloud-core = ">=2.3.0,<3.0dev" -google-crc32c = ">=1.0,<2.0dev" +google-api-core = ">=2.15.0,<3.0.0.dev0" +google-auth = ">=2.26.1,<3.0.dev0" +google-cloud-core = ">=2.3.0,<3.0.dev0" +google-crc32c = ">=1.0,<2.0.dev0" google-resumable-media = ">=2.6.0" -requests = ">=2.18.0,<3.0.0dev" +requests = ">=2.18.0,<3.0.0.dev0" [package.extras] -protobuf = ["protobuf (<5.0.0dev)"] +protobuf = ["protobuf (<5.0.0.dev0)"] [[package]] name = "google-crc32c" @@ -1902,11 +1902,11 @@ files = [ ] [package.dependencies] -google-crc32c = ">=1.0,<2.0dev" +google-crc32c = ">=1.0,<2.0.dev0" [package.extras] -aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "google-auth (>=1.22.0,<2.0dev)"] -requests = ["requests (>=2.18.0,<3.0.0dev)"] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0.dev0)", "google-auth (>=1.22.0,<2.0.dev0)"] +requests = ["requests (>=2.18.0,<3.0.0.dev0)"] [[package]] name = "googleapis-common-protos" @@ -2285,7 +2285,7 @@ files = [ [package.dependencies] attrs = ">=22.2.0" -jsonschema-specifications = ">=2023.03.6" +jsonschema-specifications = ">=2023.3.6" referencing = ">=0.28.4" rpds-py = ">=0.7.1" @@ -3042,7 +3042,7 @@ files = [ ] [package.dependencies] -protobuf = ">=3.19.0,<6.0.0dev" +protobuf = ">=3.19.0,<6.0.0.dev0" [package.extras] testing = ["google-api-core (>=1.31.5)"] @@ -4072,10 +4072,10 @@ files = [ ] [package.dependencies] -botocore = ">=1.33.2,<2.0a.0" +botocore = ">=1.33.2,<2.0a0" [package.extras] -crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] +crt = ["botocore[crt] (>=1.33.2,<2.0a0)"] [[package]] name = "schema" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "755718b3333d5fe7983ac875532429e80eb2d45684002ae9212bc96a7800d014" +content-hash = "c3108a036ae092e35f7696ffe230e271b774bb12e546db77bb8b12b5fb6eca7d" diff --git a/pyproject.toml b/pyproject.toml index f40ad4fb107..b40cdc704ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "7a7f664469070dd52dc4d9401f6b6d2d9fe7ddf0"} # branch is feature/pin-esdsl +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "bb1c84c148ac1d2b1079b2b113e52a01a861c8a6"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 674f963f388689fcd67bbbcbad26468e09cc86b5 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 25 Mar 2026 10:29:17 -0400 Subject: [PATCH 011/100] wip: es8 djelme records (migration targets) --- osf/metrics/es8_metrics.py | 221 +++++++++++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) create mode 100644 osf/metrics/es8_metrics.py diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py new file mode 100644 index 00000000000..ec20215449e --- /dev/null +++ b/osf/metrics/es8_metrics.py @@ -0,0 +1,221 @@ +from __future__ import annotations +import datetime + +import elasticsearch8.dsl as esdsl +import elasticsearch_metrics.imps.elastic8 as djelme + +from osf.metrics.utils import YearMonth + + +### +# custom dsl fields + +class YearmonthField(esdsl.Date): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs, format='strict_year_month') + + def deserialize(self, data): + if isinstance(data, int): + # elasticsearch stores dates in milliseconds since the unix epoch + _as_datetime = datetime.datetime.fromtimestamp(data // 1000) + return YearMonth.from_date(_as_datetime) + elif data is None: + return None + try: + return YearMonth.from_any(data) + except ValueError: + raise ValueError(f'unsure how to deserialize "{data}" (of type {type(data)}) to YearMonth') + + def serialize(self, data): + if isinstance(data, str): + return data + elif isinstance(data, YearMonth): + return str(data) + elif isinstance(data, (datetime.datetime, datetime.date)): + return str(YearMonth.from_date(data)) + elif data is None: + return None + else: + raise ValueError(f'unsure how to serialize "{data}" (of type {type(data)}) as YYYY-MM') + + +### +# inner objects for events + +route_prefix_analyzer = esdsl.analyzer( + 'route_prefix_analyzer', + tokenizer=esdsl.tokenizer('route_prefix_tokenizer', 'path_hierarchy', delimiter='.'), +) + + +class PageviewInfo(esdsl.InnerDoc): + """PageviewInfo + + for CountedAuthUsage generated by viewing a web page + """ + # fields that should be provided + referer_url: str + page_url: str + page_title: str + route_name: str = esdsl.mapped_field(esdsl.Keyword( + fields={ + 'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer), + }, + )) + + # fields autofilled from the above (see `_autofill_fields`) + page_path: str + referer_domain: str + hour_of_day: str + + +### +# Event records + +class OsfCountedUsageRecord(djelme.CountedUsageRecord): + ''' + + inherited fields: + platform_iri: str + database_iri: str + item_iri: str + sessionhour_id: str + within_iris: list[str] + ''' + # osf-specific fields + item_osfid: str + item_type: str + item_public: bool + user_is_authenticated: bool + action_labels: list[str] + pageview_info: PageviewInfo + + +### +# Reusable inner objects for reports + +class RunningTotal(esdsl.InnerDoc): + total: int + total_daily: int + + +class FileRunningTotals(esdsl.InnerDoc): + total: int + public: int + private: int + total_daily: int + public_daily: int + private_daily: int + + +class NodeRunningTotals(esdsl.InnerDoc): + total: int + total_excluding_spam: int + public: int + private: int + total_daily: int + total_daily_excluding_spam: int + public_daily: int + private_daily: int + + +class RegistrationRunningTotals(esdsl.InnerDoc): + total: int + public: int + embargoed: int + embargoed_v2: int + withdrawn: int + total_daily: int + public_daily: int + embargoed_daily: int + embargoed_v2_daily: int + withdrawn_daily: int + + +### +# Cyclic reports + + +class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): + node_confirmed_spam: int + node_confirmed_ham: int + node_flagged: int + registration_confirmed_spam: int + registration_confirmed_ham: int + registration_flagged: int + preprint_confirmed_spam: int + preprint_confirmed_ham: int + preprint_flagged: int + user_marked_as_spam: int + user_marked_as_ham: int + + +class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): + # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',) + institution_id: str + # user info: + user_id: str + user_name: str + department_name: str + month_last_login = YearmonthField() + month_last_active = YearmonthField() + account_creation_date = YearmonthField() + orcid_id: str + # counts: + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + public_file_count: int = esdsl.mapped_field(esdsl.Long()) + storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + + +class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): + UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', ) + institution_id: str + user_count: int + public_project_count: int + private_project_count: int + public_registration_count: int + embargoed_registration_count: int + published_preprint_count: int + storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + public_file_count: int = esdsl.mapped_field(esdsl.Long()) + monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long()) + monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) + + +class PublicItemUsageReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): + # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid') + + # where noted, fields are meant to correspond to defined terms from COUNTER + # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html + # https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html + item_osfid: str + item_type: list[str] # counter:Data-Type + provider_id: list[str] # counter:Database(?) + platform_iri: list[str] # counter:Platform + + # view counts include views on components or files contained by this item + view_count: int = esdsl.mapped_field(esdsl.Long()) + view_session_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_view_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_view_session_count: int = esdsl.mapped_field(esdsl.Long()) + + # download counts of this item only (not including contained components or files) + download_count: int = esdsl.mapped_field(esdsl.Long()) + download_session_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_download_count: int = esdsl.mapped_field(esdsl.Long()) + cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long()) + + +class PrivateSpamMetricsReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): + node_oopspam_flagged: int + node_oopspam_hammed: int + node_akismet_flagged: int + node_akismet_hammed: int + preprint_oopspam_flagged: int + preprint_oopspam_hammed: int + preprint_akismet_flagged: int + preprint_akismet_hammed: int From 2e73161b508a73e192ae3675f60ec05569502848 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 1 Apr 2026 00:52:14 +0300 Subject: [PATCH 012/100] add new metrics --- osf/metrics/es8_metrics.py | 166 ++++++++++++++++++++++++++++++++++++- 1 file changed, 162 insertions(+), 4 deletions(-) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index ec20215449e..4c1b2de4a2d 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -1,6 +1,6 @@ from __future__ import annotations import datetime - +import enum import elasticsearch8.dsl as esdsl import elasticsearch_metrics.imps.elastic8 as djelme @@ -91,6 +91,100 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): pageview_info: PageviewInfo +class CountedAuthUsage(djelme.CountedUsageRecord): + """CountedAuthUsage + + Something was used! Let's quickly take note of that and + move on, then come back later to query/analyze/investigate. + + Aim to support a COUNTER-style reporting api + (see https://cop5.projectcounter.org/en/5.0.2/) + """ + + # where noted, fields correspond to defined terms from COUNTER + # https://cop5.projectcounter.org/en/5.0.2/appendices/a-glossary-of-terms.html + platform_iri: str + provider_id: str + session_id: str + item_guid: str + item_type: str + surrounding_guids: list[str] + item_public: bool + user_is_authenticated: bool + action_labels: list[str] + class ActionLabel(enum.Enum): + SEARCH = 'search' # counter:Search + VIEW = 'view' # counter:Investigation + DOWNLOAD = 'download' # counter:Request + WEB = 'web' # counter:Regular (aka "pageview") + API = 'api' # counter:TDM (aka "non-web api usage") + # TODO: count api usage, distinguish between web and non-web api requests + + # pageviews get additional info to support the "node analytics" view + # (see `api.metrics.views.NodeAnalyticsQuery`) + pageview_info: PageviewInfo + + class Meta: + dynamic = djelme.MetaField('strict') + source = djelme.MetaField(enabled=True) + + +class BasePreprintMetrics(djelme.CountedUsageRecord): + ''' + inherited fields: + platform_iri: str + database_iri: str + item_iri: str + sessionhour_id: str + within_iris: list[str] + ''' + count: int + provider_id: str + user_id: str + preprint_id: str + version: str + path: str + + class Index: + settings = { + 'number_of_shards': 1, + 'number_of_replicas': 1, + 'refresh_interval': '1s', + } + + class Meta: + abstract = True + source = djelme.MetaField(enabled=True) + + +class PreprintView(BasePreprintMetrics): + pass + + +class PreprintDownload(BasePreprintMetrics): + pass + + +class RegistriesModerationMetrics(djelme.CountedUsageRecord): + registration_id: str + provider_id: str + trigger: str + from_state: str + to_state: str + user_id: str + comment: str + + class Index: + settings = { + 'number_of_shards': 1, + 'number_of_replicas': 1, + 'refresh_interval': '1s', + } + + class Meta: + source = djelme.MetaField(enabled=True) + + ### # Reusable inner objects for reports @@ -132,10 +226,74 @@ class RegistrationRunningTotals(esdsl.InnerDoc): withdrawn_daily: int +class UsageByStorageAddon(esdsl.InnerDoc): + addon_shortname: str + enabled_usersettings: RunningTotal + linked_usersettings: RunningTotal + deleted_usersettings: RunningTotal + usersetting_links: RunningTotal + connected_nodesettings: RunningTotal + disconnected_nodesettings: RunningTotal + deleted_nodesettings: RunningTotal + + ### # Cyclic reports +class StorageAddonUsage(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + usage_by_addon: UsageByStorageAddon + + +class DownloadCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + daily_file_downloads: int + + +class InstitutionSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) + + institution_id: str + institution_name: str + users: RunningTotal + nodes: NodeRunningTotals + projects: NodeRunningTotals + registered_nodes: RegistrationRunningTotals + registered_projects: RegistrationRunningTotals + + +class NewUserDomainReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) + + domain_name: str + domain_name: int + + +class NodeSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + nodes: NodeRunningTotals + projects: NodeRunningTotals + registered_nodes: RegistrationRunningTotals + registered_projects: RegistrationRunningTotals + + +class OsfstorageFileCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + files: FileRunningTotals + + +class PreprintSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) + provider_key: str + preprint_count: int + + +class UserSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): + active: int + deactivated: int + merged: int + new_users_daily: int + new_users_with_institution_daily: int + unconfirmed: int + + class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): node_confirmed_spam: int node_confirmed_ham: int @@ -151,7 +309,7 @@ class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): - # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',) + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) institution_id: str # user info: user_id: str @@ -172,7 +330,7 @@ class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHL class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): - UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', ) + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', ) institution_id: str user_count: int public_project_count: int @@ -187,7 +345,7 @@ class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelm class PublicItemUsageReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): - # TODO: UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'item_osfid') + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid') # where noted, fields are meant to correspond to defined terms from COUNTER # https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html From 4b4a4780cadb0361f69757a1db290b08b6d6178e Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Mon, 6 Apr 2026 14:29:56 +0300 Subject: [PATCH 013/100] fix flake8 --- osf/metrics/es8_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 4c1b2de4a2d..e9ed147e858 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -269,7 +269,7 @@ class NewUserDomainReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): class NodeSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): - nodes: NodeRunningTotals + nodes: NodeRunningTotals projects: NodeRunningTotals registered_nodes: RegistrationRunningTotals registered_projects: RegistrationRunningTotals From d3b48e401e9713fda09ebc3633ae2c635a31daba Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 8 Apr 2026 17:46:42 +0300 Subject: [PATCH 014/100] add tests, use new version of djelme, consolidate into OsfCountedUsageRecord --- osf/metrics/es8_metrics.py | 151 ++++++++++---------------- osf_tests/metrics/test_es8_metrics.py | 42 +++++++ poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 102 insertions(+), 99 deletions(-) create mode 100644 osf_tests/metrics/test_es8_metrics.py diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index e9ed147e858..666f9bae359 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -1,8 +1,8 @@ -from __future__ import annotations import datetime import enum import elasticsearch8.dsl as esdsl import elasticsearch_metrics.imps.elastic8 as djelme +from urllib.parse import urlsplit from osf.metrics.utils import YearMonth @@ -53,6 +53,16 @@ class PageviewInfo(esdsl.InnerDoc): for CountedAuthUsage generated by viewing a web page """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.page_path: str = '' + if self.page_url: self.page_path = urlsplit(self.page_url).path.rstrip('/') + self.referer_domain: str = '' + if self.referer_url: self.referer_domain = urlsplit(self.referer_url).netloc + self.hour_of_day: int = 0 + if self.timestamp: self.hour_of_day = self.timestamp.hour + # fields that should be provided referer_url: str page_url: str @@ -63,11 +73,9 @@ class PageviewInfo(esdsl.InnerDoc): }, )) - # fields autofilled from the above (see `_autofill_fields`) page_path: str referer_domain: str - hour_of_day: str - + hour_of_day: int ### # Event records @@ -90,82 +98,15 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): action_labels: list[str] pageview_info: PageviewInfo +class ActionLabel(enum.Enum): + SEARCH = 'search' # counter:Search + VIEW = 'view' # counter:Investigation + DOWNLOAD = 'download' # counter:Request + WEB = 'web' # counter:Regular (aka "pageview") + API = 'api' # counter:TDM (aka "non-web api usage") -class CountedAuthUsage(djelme.CountedUsageRecord): - """CountedAuthUsage - - Something was used! Let's quickly take note of that and - move on, then come back later to query/analyze/investigate. - - Aim to support a COUNTER-style reporting api - (see https://cop5.projectcounter.org/en/5.0.2/) - """ - # where noted, fields correspond to defined terms from COUNTER - # https://cop5.projectcounter.org/en/5.0.2/appendices/a-glossary-of-terms.html - platform_iri: str - provider_id: str - session_id: str - item_guid: str - item_type: str - surrounding_guids: list[str] - item_public: bool - user_is_authenticated: bool - action_labels: list[str] - class ActionLabel(enum.Enum): - SEARCH = 'search' # counter:Search - VIEW = 'view' # counter:Investigation - DOWNLOAD = 'download' # counter:Request - WEB = 'web' # counter:Regular (aka "pageview") - API = 'api' # counter:TDM (aka "non-web api usage") - # TODO: count api usage, distinguish between web and non-web api requests - - # pageviews get additional info to support the "node analytics" view - # (see `api.metrics.views.NodeAnalyticsQuery`) - pageview_info: PageviewInfo - - class Meta: - dynamic = djelme.MetaField('strict') - source = djelme.MetaField(enabled=True) - - -class BasePreprintMetrics(djelme.CountedUsageRecord): - ''' - inherited fields: - platform_iri: str - database_iri: str - item_iri: str - sessionhour_id: str - within_iris: list[str] - ''' - count: int - provider_id: str - user_id: str - preprint_id: str - version: str - path: str - - class Index: - settings = { - 'number_of_shards': 1, - 'number_of_replicas': 1, - 'refresh_interval': '1s', - } - - class Meta: - abstract = True - source = djelme.MetaField(enabled=True) - - -class PreprintView(BasePreprintMetrics): - pass - - -class PreprintDownload(BasePreprintMetrics): - pass - - -class RegistriesModerationMetrics(djelme.CountedUsageRecord): +class Es8RegistriesModerationMetrics(djelme.EventRecord): registration_id: str provider_id: str trigger: str @@ -181,9 +122,6 @@ class Index: 'refresh_interval': '1s', } - class Meta: - source = djelme.MetaField(enabled=True) - ### # Reusable inner objects for reports @@ -241,15 +179,20 @@ class UsageByStorageAddon(esdsl.InnerDoc): # Cyclic reports -class StorageAddonUsage(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): - usage_by_addon: UsageByStorageAddon +class Es8StorageAddonUsage(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 + + usage_by_addon: list[UsageByStorageAddon] + +class Es8DownloadCountReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 -class DownloadCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): daily_file_downloads: int -class InstitutionSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): +class Es8InstitutionSummaryReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) institution_id: str @@ -261,31 +204,40 @@ class InstitutionSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY registered_projects: RegistrationRunningTotals -class NewUserDomainReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): +class Es8NewUserDomainReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) domain_name: str domain_name: int -class NodeSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): +class Es8NodeSummaryReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 + nodes: NodeRunningTotals projects: NodeRunningTotals registered_nodes: RegistrationRunningTotals registered_projects: RegistrationRunningTotals -class OsfstorageFileCountReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): +class Es8OsfstorageFileCountReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 + files: FileRunningTotals -class PreprintSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): +class Es8PreprintSummaryReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 + UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) provider_key: str preprint_count: int -class UserSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): +class Es8UserSummaryReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 3 + active: int deactivated: int merged: int @@ -294,7 +246,9 @@ class UserSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.DAILY): unconfirmed: int -class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): +class Es8SpamSummaryReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 2 + node_confirmed_spam: int node_confirmed_ham: int node_flagged: int @@ -308,8 +262,10 @@ class SpamSummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): user_marked_as_ham: int -class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): +class Es8InstitutionalUserReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 2 UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) + institution_id: str # user info: user_id: str @@ -329,8 +285,10 @@ class InstitutionalUserReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHL storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) -class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): +class Es8InstitutionMonthlySummaryReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 2 UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', ) + institution_id: str user_count: int public_project_count: int @@ -344,7 +302,8 @@ class InstitutionMonthlySummaryReport(djelme.CyclicRecord, cycle_timedepth=djelm monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) -class PublicItemUsageReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): +class Es8PublicItemUsageReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 2 UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid') # where noted, fields are meant to correspond to defined terms from COUNTER @@ -368,7 +327,9 @@ class PublicItemUsageReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY) cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long()) -class PrivateSpamMetricsReport(djelme.CyclicRecord, cycle_timedepth=djelme.MONTHLY): +class Es8PrivateSpamMetricsReport(djelme.CyclicRecord): + CYCLE_TIMEDEPTH = 2 + node_oopspam_flagged: int node_oopspam_hammed: int node_akismet_flagged: int diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py new file mode 100644 index 00000000000..2afca72174b --- /dev/null +++ b/osf_tests/metrics/test_es8_metrics.py @@ -0,0 +1,42 @@ +from datetime import datetime + +from osf.metrics.es8_metrics import ( + Es8DownloadCountReport, + Es8UserSummaryReport, + OsfCountedUsageRecord, + PageviewInfo +) + + +class TestEs8Metrics: + def test_import_all_reports(self): + assert True + + def test_instantiate_of_reports(self): + download_report = Es8DownloadCountReport() + assert hasattr(download_report, 'daily_file_downloads') + assert download_report.daily_file_downloads is None + + user_report = Es8UserSummaryReport() + assert hasattr(user_report, 'active') + assert user_report.active is None + + def test_nested_pageview(self): + usage = OsfCountedUsageRecord( + pageview_info={ + "page_url": "https://example.com", + "referer_url": "https://google.com", + } + ) + assert usage.pageview_info is not None + + def test_pageview_info_autofill(self): + obj = PageviewInfo( + page_url="https://example.com/path/test", + referer_url="https://google.com", + timestamp=datetime(2024, 1, 1, 15, 0), + ) + + assert obj.page_path == "/path/test" + assert obj.referer_domain == "google.com" + assert obj.hour_of_day == 15 diff --git a/poetry.lock b/poetry.lock index f0dca07d95c..5bbe2ae1f49 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "bb1c84c148ac1d2b1079b2b113e52a01a861c8a6" -resolved_reference = "bb1c84c148ac1d2b1079b2b113e52a01a861c8a6" +reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" +resolved_reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "c3108a036ae092e35f7696ffe230e271b774bb12e546db77bb8b12b5fb6eca7d" +content-hash = "3a5ea0758a65dac062ba307a2f29bdb8d637c2b0a78a2f68fea86c39516c6922" diff --git a/pyproject.toml b/pyproject.toml index b40cdc704ab..375b8cacd25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "bb1c84c148ac1d2b1079b2b113e52a01a861c8a6"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "1b644bb927cfb28e3a23b28ad625279749d859e5"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From e4bec9dfa9f5e760bc11df3ce991af32a56e300a Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Thu, 9 Apr 2026 10:45:01 +0300 Subject: [PATCH 015/100] add imports to init, flake8 --- osf/metrics/__init__.py | 18 ++++++++++++++++++ osf/metrics/es8_metrics.py | 9 ++++++--- osf_tests/metrics/test_es8_metrics.py | 12 ++++++------ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/osf/metrics/__init__.py b/osf/metrics/__init__.py index 0e7b1a1cf32..b2c8af54999 100644 --- a/osf/metrics/__init__.py +++ b/osf/metrics/__init__.py @@ -18,6 +18,17 @@ UserSummaryReport, ) +from .es8_metrics import ( + Es8DownloadCountReport, + Es8UserSummaryReport, + Es8NodeSummaryReport, + Es8SpamSummaryReport, + Es8InstitutionSummaryReport, + Es8NewUserDomainReport, + Es8OsfstorageFileCountReport, + Es8StorageAddonUsage, +) + DAILY_REPORTS = ( DownloadCountReport, InstitutionSummaryReport, @@ -27,6 +38,13 @@ PreprintSummaryReport, StorageAddonUsage, UserSummaryReport, + Es8DownloadCountReport, + Es8InstitutionSummaryReport, + Es8NewUserDomainReport, + Es8NodeSummaryReport, + Es8OsfstorageFileCountReport, + Es8StorageAddonUsage, + Es8UserSummaryReport ) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 666f9bae359..020a9c72c80 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -57,11 +57,14 @@ class PageviewInfo(esdsl.InnerDoc): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.page_path: str = '' - if self.page_url: self.page_path = urlsplit(self.page_url).path.rstrip('/') + if self.page_url: + self.page_path = urlsplit(self.page_url).path.rstrip('/') self.referer_domain: str = '' - if self.referer_url: self.referer_domain = urlsplit(self.referer_url).netloc + if self.referer_url: + self.referer_domain = urlsplit(self.referer_url).netloc self.hour_of_day: int = 0 - if self.timestamp: self.hour_of_day = self.timestamp.hour + if self.timestamp: + self.hour_of_day = self.timestamp.hour # fields that should be provided referer_url: str diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index 2afca72174b..1158836b688 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -24,19 +24,19 @@ def test_instantiate_of_reports(self): def test_nested_pageview(self): usage = OsfCountedUsageRecord( pageview_info={ - "page_url": "https://example.com", - "referer_url": "https://google.com", + 'page_url': 'https://example.com', + 'referer_url': 'https://google.com', } ) assert usage.pageview_info is not None def test_pageview_info_autofill(self): obj = PageviewInfo( - page_url="https://example.com/path/test", - referer_url="https://google.com", + page_url='https://example.com/path/test', + referer_url='https://google.com', timestamp=datetime(2024, 1, 1, 15, 0), ) - assert obj.page_path == "/path/test" - assert obj.referer_domain == "google.com" + assert obj.page_path == '/path/tes' + assert obj.referer_domain == 'google.com' assert obj.hour_of_day == 15 From ee515ef615b2363724aba1d445ef7e4e15f89c4a Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Thu, 9 Apr 2026 16:39:44 +0300 Subject: [PATCH 016/100] fix test, imports, flake8 --- .docker-compose.env | 2 +- osf/metrics/__init__.py | 1 - osf_tests/metrics/test_es8_metrics.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.docker-compose.env b/.docker-compose.env index 449c9747adf..444788ecb46 100644 --- a/.docker-compose.env +++ b/.docker-compose.env @@ -6,7 +6,7 @@ DOMAIN=http://localhost:5000/ INTERNAL_DOMAIN=http://192.168.168.167:5000/ API_DOMAIN=http://localhost:8000/ ELASTIC_URI=192.168.168.167:9200 -ELASTIC6_URI=192.168.168.167:9201 +ELASTIC6_URI=http://192.168.168.167:9201 ELASTIC8_URI=http://192.168.168.167:9202 OSF_DB_HOST=192.168.168.167 DB_HOST=192.168.168.167 diff --git a/osf/metrics/__init__.py b/osf/metrics/__init__.py index b2c8af54999..6cef14f5cf9 100644 --- a/osf/metrics/__init__.py +++ b/osf/metrics/__init__.py @@ -22,7 +22,6 @@ Es8DownloadCountReport, Es8UserSummaryReport, Es8NodeSummaryReport, - Es8SpamSummaryReport, Es8InstitutionSummaryReport, Es8NewUserDomainReport, Es8OsfstorageFileCountReport, diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index 1158836b688..3d48a3d35c4 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -37,6 +37,6 @@ def test_pageview_info_autofill(self): timestamp=datetime(2024, 1, 1, 15, 0), ) - assert obj.page_path == '/path/tes' + assert obj.page_path == '/path/test' assert obj.referer_domain == 'google.com' assert obj.hour_of_day == 15 From ca60b58e0dc08d9f81ca085df45f43792d3ed252 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Fri, 10 Apr 2026 17:23:02 +0300 Subject: [PATCH 017/100] add security, flake8, fixes, add to test-build.yml --- .docker-compose.env | 5 ++++- .github/workflows/test-build.yml | 18 ++++++++++++++++++ api/base/settings/defaults.py | 8 +++++++- docker-compose.yml | 13 ++++++++++++- website/settings/defaults.py | 5 ++++- 5 files changed, 45 insertions(+), 4 deletions(-) diff --git a/.docker-compose.env b/.docker-compose.env index 444788ecb46..2542d16e841 100644 --- a/.docker-compose.env +++ b/.docker-compose.env @@ -6,8 +6,11 @@ DOMAIN=http://localhost:5000/ INTERNAL_DOMAIN=http://192.168.168.167:5000/ API_DOMAIN=http://localhost:8000/ ELASTIC_URI=192.168.168.167:9200 -ELASTIC6_URI=http://192.168.168.167:9201 +ELASTIC6_URI=192.168.168.167:9201 ELASTIC8_URI=http://192.168.168.167:9202 +ELASTIC8_CERT_PATH=/elastic8_certs/ca/ca.crt +ELASTIC8_USERNAME=elastic +ELASTIC8_SECRET=secretsecret OSF_DB_HOST=192.168.168.167 DB_HOST=192.168.168.167 REDIS_HOST=redis://192.168.168.167:6379 diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 33942968529..0e8541acf2a 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -64,6 +64,14 @@ jobs: permissions: checks: write services: + elasticsearch8: + image: elasticsearch:8.19.11 + ports: + - 9202:9200 + env: + xpack.security.enabled: false + node.name: singlenode + cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -84,6 +92,8 @@ jobs: - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report + env: + ELASTIC8_URL: http://localhost:9202 api1_and_js: runs-on: ubuntu-22.04 @@ -208,6 +218,14 @@ jobs: checks: write needs: build-cache services: + elasticsearch8: + image: elasticsearch:8.19.11 + ports: + - 9202:9200 + env: + xpack.security.enabled: false + node.name: singlenode + cluster.initial_master_nodes: singlenode postgres: image: postgres diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 816586ffcfb..42e8d9bd495 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -316,7 +316,7 @@ HASHIDS_SALT = 'pinkhimalayan' # django-elasticsearch-metrics -DJELME_AUTOSETUP = True +# DJELME_AUTOSETUP = True DJELME_BACKENDS = { 'osfmetrics_es6': { 'elasticsearch_metrics.imps.elastic6': { @@ -327,6 +327,12 @@ 'osfmetrics_es8': { 'elasticsearch_metrics.imps.elastic8': { 'hosts': osf_settings.ELASTIC8_URI, + 'ca_certs': osf_settings.ELASTIC8_CERT_PATH, + 'basic_auth': ( + (osf_settings.ELASTIC8_USERNAME, osf_settings.ELASTIC8_SECRET) + if osf_settings.ELASTIC8_SECRET is not None + else None + ), }, }, } diff --git a/docker-compose.yml b/docker-compose.yml index f26c3617b67..09aedd58247 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -82,8 +82,19 @@ services: image: docker.elastic.co/elasticsearch/elasticsearch:8.19.11 platform: linux/arm64 environment: - - xpack.security.enabled=false + - ELASTIC_PASSWORD=secretsecret + - node.name=singlenode - discovery.type=single-node + - xpack.security.enabled=true + - xpack.security.http.ssl.enabled=true + - xpack.security.http.ssl.key=/elastic8_certs/singlenode/singlenode.key + - xpack.security.http.ssl.certificate=/elastic8_certs/singlenode/singlenode.crt + - xpack.security.http.ssl.certificate_authorities=/elastic8_certs/ca/ca.crt + - xpack.security.transport.ssl.enabled=true + - xpack.security.transport.ssl.key=/elastic8_certs/singlenode/singlenode.key + - xpack.security.transport.ssl.certificate=/elastic8_certs/singlenode/singlenode.crt + - xpack.security.transport.ssl.certificate_authorities=/elastic8_certs/ca/ca.crt + - xpack.security.transport.ssl.verification_mode=certificate ports: - 9202:9200 volumes: diff --git a/website/settings/defaults.py b/website/settings/defaults.py index d0ae58dc863..1e8032cc95c 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -113,7 +113,10 @@ def parent_dir(path): SEARCH_ENGINE = 'elastic' # Can be 'elastic', or None ELASTIC_URI = '127.0.0.1:9200' ELASTIC6_URI = os.environ.get('ELASTIC6_URI', '127.0.0.1:9201') -ELASTIC8_URI = os.environ.get('ELASTIC8_URI', '127.0.0.1:9202') +ELASTIC8_URI = os.environ.get('ELASTIC8_URI') +ELASTIC8_CERT_PATH = os.environ.get('ELASTIC8_CERT_PATH') +ELASTIC8_USERNAME = os.environ.get('ELASTIC8_USERNAME', 'elastic') +ELASTIC8_SECRET = os.environ.get('ELASTIC8_SECRET') ELASTIC_TIMEOUT = 10 ELASTIC_INDEX = 'website' ELASTIC_KWARGS = { From 080daf69dbcd839ed7d712c7f78053b13097b1e1 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Sat, 11 Apr 2026 00:01:31 +0300 Subject: [PATCH 018/100] test-build update --- .github/workflows/test-build.yml | 41 +++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 0e8541acf2a..6aa39e39800 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -12,6 +12,7 @@ env: OSF_DB_PORT: 5432 OSF_DB_PASSWORD: postgres GITHUB_ACTIONS: true + ELASTIC8_URL: http://localhost:9202 jobs: build-cache: @@ -37,6 +38,14 @@ jobs: permissions: checks: write services: + elasticsearch8: + image: elasticsearch:8.19.11 + ports: + - 9202:9200 + env: + xpack.security.enabled: false + node.name: singlenode + cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -57,6 +66,8 @@ jobs: - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report + env: + ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} website: runs-on: ubuntu-22.04 @@ -93,7 +104,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: http://localhost:9202 + ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} api1_and_js: runs-on: ubuntu-22.04 @@ -101,6 +112,14 @@ jobs: permissions: checks: write services: + elasticsearch8: + image: elasticsearch:8.19.11 + ports: + - 9202:9200 + env: + xpack.security.enabled: false + node.name: singlenode + cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -123,6 +142,8 @@ jobs: - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report + env: + ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} api2: runs-on: ubuntu-22.04 @@ -130,6 +151,14 @@ jobs: permissions: checks: write services: + elasticsearch8: + image: elasticsearch:8.19.11 + ports: + - 9202:9200 + env: + xpack.security.enabled: false + node.name: singlenode + cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -150,6 +179,8 @@ jobs: - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report + env: + ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} api3_and_osf: runs-on: ubuntu-22.04 @@ -218,14 +249,6 @@ jobs: checks: write needs: build-cache services: - elasticsearch8: - image: elasticsearch:8.19.11 - ports: - - 9202:9200 - env: - xpack.security.enabled: false - node.name: singlenode - cluster.initial_master_nodes: singlenode postgres: image: postgres From fde32a4ee09debee75af5523088fd8c3c921f713 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Sat, 11 Apr 2026 01:19:07 +0300 Subject: [PATCH 019/100] test-build fix url --- .github/workflows/test-build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 6aa39e39800..60d056de001 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -67,7 +67,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} + ELASTIC8_URL: ${{ env.ELASTIC8_URL }} website: runs-on: ubuntu-22.04 @@ -104,7 +104,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} + ELASTIC8_URL: ${{ env.ELASTIC8_URL }} api1_and_js: runs-on: ubuntu-22.04 @@ -143,7 +143,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} + ELASTIC8_URL: ${{ env.ELASTIC8_URL }} api2: runs-on: ubuntu-22.04 @@ -180,7 +180,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.OSF_DB_PASSWORD }} + ELASTIC8_URL: ${{ env.ELASTIC8_URL }} api3_and_osf: runs-on: ubuntu-22.04 From e6da70bbf73cbbf348fddf148d91b03472913e9e Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Sat, 11 Apr 2026 01:39:47 +0300 Subject: [PATCH 020/100] test-build fix naming --- .github/workflows/test-build.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 60d056de001..fdfd4c190b3 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -12,7 +12,7 @@ env: OSF_DB_PORT: 5432 OSF_DB_PASSWORD: postgres GITHUB_ACTIONS: true - ELASTIC8_URL: http://localhost:9202 + ELASTIC8_URI: http://localhost:9202 jobs: build-cache: @@ -67,7 +67,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.ELASTIC8_URL }} + ELASTIC8_URI: ${{ env.ELASTIC8_URI }} website: runs-on: ubuntu-22.04 @@ -104,7 +104,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.ELASTIC8_URL }} + ELASTIC8_URI: ${{ env.ELASTIC8_URI }} api1_and_js: runs-on: ubuntu-22.04 @@ -143,7 +143,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.ELASTIC8_URL }} + ELASTIC8_URI: ${{ env.ELASTIC8_URI }} api2: runs-on: ubuntu-22.04 @@ -180,7 +180,7 @@ jobs: if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report env: - ELASTIC8_URL: ${{ env.ELASTIC8_URL }} + ELASTIC8_URI: ${{ env.ELASTIC8_URI }} api3_and_osf: runs-on: ubuntu-22.04 From 2b8a81c10b13e687c29144acf55b699d73ac3a2d Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Sat, 11 Apr 2026 08:55:02 +0300 Subject: [PATCH 021/100] update test --- osf_tests/metrics/test_es8_metrics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index 3d48a3d35c4..28dedd01eb4 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -15,11 +15,9 @@ def test_import_all_reports(self): def test_instantiate_of_reports(self): download_report = Es8DownloadCountReport() assert hasattr(download_report, 'daily_file_downloads') - assert download_report.daily_file_downloads is None user_report = Es8UserSummaryReport() assert hasattr(user_report, 'active') - assert user_report.active is None def test_nested_pageview(self): usage = OsfCountedUsageRecord( From 6167778672af9f5d87ede22d31ce71b0863d09fc Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Mon, 13 Apr 2026 12:24:40 +0300 Subject: [PATCH 022/100] add wait --- .github/workflows/test-build.yml | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index fdfd4c190b3..844ca93fb15 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -61,13 +61,23 @@ jobs: steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build + - name: Wait for Elasticsearch + run: | + echo "Waiting for Elasticsearch..." + for i in {1..30}; do + if curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; then + echo "Elasticsearch is ready" + exit 0 + fi + sleep 2 + done + echo "Elasticsearch failed" + exit 1 - name: Run tests run: poetry run python3 -m invoke test-ci-addons --junit - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report - env: - ELASTIC8_URI: ${{ env.ELASTIC8_URI }} website: runs-on: ubuntu-22.04 @@ -103,8 +113,6 @@ jobs: - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report - env: - ELASTIC8_URI: ${{ env.ELASTIC8_URI }} api1_and_js: runs-on: ubuntu-22.04 @@ -142,8 +150,6 @@ jobs: - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report - env: - ELASTIC8_URI: ${{ env.ELASTIC8_URI }} api2: runs-on: ubuntu-22.04 @@ -179,8 +185,6 @@ jobs: - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report - env: - ELASTIC8_URI: ${{ env.ELASTIC8_URI }} api3_and_osf: runs-on: ubuntu-22.04 From eb0a5d9efe8cc6c683550acf99be5047b766e07e Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Mon, 13 Apr 2026 17:57:22 +0300 Subject: [PATCH 023/100] remove wait --- .github/workflows/test-build.yml | 12 ------------ poetry.lock | 6 +++--- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 844ca93fb15..d6bf817b7c6 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -61,18 +61,6 @@ jobs: steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build - - name: Wait for Elasticsearch - run: | - echo "Waiting for Elasticsearch..." - for i in {1..30}; do - if curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; then - echo "Elasticsearch is ready" - exit 0 - fi - sleep 2 - done - echo "Elasticsearch failed" - exit 1 - name: Run tests run: poetry run python3 -m invoke test-ci-addons --junit - name: Upload report diff --git a/poetry.lock b/poetry.lock index 5bbe2ae1f49..d524525f564 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" -resolved_reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" +reference = "c43abd63c623cdfbfaf87da6194d2a6f74ac2dd5" +resolved_reference = "c43abd63c623cdfbfaf87da6194d2a6f74ac2dd5" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "3a5ea0758a65dac062ba307a2f29bdb8d637c2b0a78a2f68fea86c39516c6922" +content-hash = "fd91980689d1fa7c440e0c81a0b0e9543445821350cb154f18c63f236c0898be" diff --git a/pyproject.toml b/pyproject.toml index 375b8cacd25..fb008eb2c41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "1b644bb927cfb28e3a23b28ad625279749d859e5"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "c43abd63c623cdfbfaf87da6194d2a6f74ac2dd5"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 78ed96fdc13cfb70d509d9e55f2028e9f789adf5 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 14 Apr 2026 12:50:11 +0300 Subject: [PATCH 024/100] cleanup --- .github/workflows/test-build.yml | 37 +++++++++++--------------------- api/base/settings/defaults.py | 1 - 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index d6bf817b7c6..8d1d3ebb318 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -12,7 +12,6 @@ env: OSF_DB_PORT: 5432 OSF_DB_PASSWORD: postgres GITHUB_ACTIONS: true - ELASTIC8_URI: http://localhost:9202 jobs: build-cache: @@ -61,8 +60,20 @@ jobs: steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build + - name: Wait for Elasticsearch + run: | + echo "Waiting for ES8 health..." + sleep 5 + until curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; do + echo "ES8 not ready yet..." + sleep 5 + done + + echo "ES8 started successfully!" - name: Run tests run: poetry run python3 -m invoke test-ci-addons --junit + env: + ELASTIC8_URI: http://localhost:9202 - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report @@ -73,14 +84,6 @@ jobs: permissions: checks: write services: - elasticsearch8: - image: elasticsearch:8.19.11 - ports: - - 9202:9200 - env: - xpack.security.enabled: false - node.name: singlenode - cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -108,14 +111,6 @@ jobs: permissions: checks: write services: - elasticsearch8: - image: elasticsearch:8.19.11 - ports: - - 9202:9200 - env: - xpack.security.enabled: false - node.name: singlenode - cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -145,14 +140,6 @@ jobs: permissions: checks: write services: - elasticsearch8: - image: elasticsearch:8.19.11 - ports: - - 9202:9200 - env: - xpack.security.enabled: false - node.name: singlenode - cluster.initial_master_nodes: singlenode postgres: image: postgres env: diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 42e8d9bd495..72e169c25a1 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -316,7 +316,6 @@ HASHIDS_SALT = 'pinkhimalayan' # django-elasticsearch-metrics -# DJELME_AUTOSETUP = True DJELME_BACKENDS = { 'osfmetrics_es6': { 'elasticsearch_metrics.imps.elastic6': { From 70cf5e2442fb8d8c271a1f8ab7d1b8b63191c0d0 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 14 Apr 2026 13:13:36 +0300 Subject: [PATCH 025/100] add wait, downgrade djelme, flake8 --- .github/workflows/test-build.yml | 20 +++++++++++++++++++- poetry.lock | 6 +++--- pyproject.toml | 2 +- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 8d1d3ebb318..09fbbb5b319 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -68,7 +68,6 @@ jobs: echo "ES8 not ready yet..." sleep 5 done - echo "ES8 started successfully!" - name: Run tests run: poetry run python3 -m invoke test-ci-addons --junit @@ -111,6 +110,14 @@ jobs: permissions: checks: write services: + elasticsearch8: + image: elasticsearch:8.19.11 + ports: + - 9202:9200 + env: + xpack.security.enabled: false + node.name: singlenode + cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -128,8 +135,19 @@ jobs: - uses: ./.github/actions/start-build - name: NVM & yarn install run: poetry run python3 -m invoke assets --dev + - name: Wait for Elasticsearch + run: | + echo "Waiting for ES8 health..." + sleep 5 + until curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; do + echo "ES8 not ready yet..." + sleep 5 + done + echo "ES8 started successfully!" - name: Run test run: poetry run python3 -m invoke test-ci-api1-and-js --junit + env: + ELASTIC8_URI: http://localhost:9202 - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report diff --git a/poetry.lock b/poetry.lock index d524525f564..5bbe2ae1f49 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "c43abd63c623cdfbfaf87da6194d2a6f74ac2dd5" -resolved_reference = "c43abd63c623cdfbfaf87da6194d2a6f74ac2dd5" +reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" +resolved_reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "fd91980689d1fa7c440e0c81a0b0e9543445821350cb154f18c63f236c0898be" +content-hash = "3a5ea0758a65dac062ba307a2f29bdb8d637c2b0a78a2f68fea86c39516c6922" diff --git a/pyproject.toml b/pyproject.toml index fb008eb2c41..375b8cacd25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "c43abd63c623cdfbfaf87da6194d2a6f74ac2dd5"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "1b644bb927cfb28e3a23b28ad625279749d859e5"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 3e35fee6522c4ad7e23da83a8915fde74455bebf Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 14 Apr 2026 16:09:27 +0300 Subject: [PATCH 026/100] add elastic8 --- .github/workflows/test-build.yml | 19 +++++++++++++++++++ osf_tests/metrics/test_es8_metrics.py | 6 ++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 09fbbb5b319..0f2e101c408 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -158,6 +158,14 @@ jobs: permissions: checks: write services: + elasticsearch8: + image: elasticsearch:8.19.11 + ports: + - 9202:9200 + env: + xpack.security.enabled: false + node.name: singlenode + cluster.initial_master_nodes: singlenode postgres: image: postgres env: @@ -173,8 +181,19 @@ jobs: steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build + - name: Wait for Elasticsearch + run: | + echo "Waiting for ES8 health..." + sleep 5 + until curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; do + echo "ES8 not ready yet..." + sleep 5 + done + echo "ES8 started successfully!" - name: Run tests run: poetry run python3 -m invoke test-ci-api2 --junit + env: + ELASTIC8_URI: http://localhost:9202 - name: Upload report if: (success() || failure()) # run this step even if previous step failed uses: ./.github/actions/gen-report diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index 28dedd01eb4..07705825f86 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -13,14 +13,15 @@ def test_import_all_reports(self): assert True def test_instantiate_of_reports(self): - download_report = Es8DownloadCountReport() + download_report = Es8DownloadCountReport(cycle_coverage='2026-01-01') assert hasattr(download_report, 'daily_file_downloads') - user_report = Es8UserSummaryReport() + user_report = Es8UserSummaryReport(cycle_coverage='2026-01-01') assert hasattr(user_report, 'active') def test_nested_pageview(self): usage = OsfCountedUsageRecord( + cycle_coverage='2026-01-01', pageview_info={ 'page_url': 'https://example.com', 'referer_url': 'https://google.com', @@ -30,6 +31,7 @@ def test_nested_pageview(self): def test_pageview_info_autofill(self): obj = PageviewInfo( + cycle_coverage='2026-01-01', page_url='https://example.com/path/test', referer_url='https://google.com', timestamp=datetime(2024, 1, 1, 15, 0), From a2363420c43c40a55b69630a096fb549cc49a71a Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Tue, 14 Apr 2026 16:46:59 +0300 Subject: [PATCH 027/100] fix test --- osf_tests/metrics/test_es8_metrics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index 07705825f86..d6b3d4c4434 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -13,15 +13,15 @@ def test_import_all_reports(self): assert True def test_instantiate_of_reports(self): - download_report = Es8DownloadCountReport(cycle_coverage='2026-01-01') + download_report = Es8DownloadCountReport(cycle_coverage='2026.01.01') assert hasattr(download_report, 'daily_file_downloads') - user_report = Es8UserSummaryReport(cycle_coverage='2026-01-01') + user_report = Es8UserSummaryReport(cycle_coverage='2026.01.01') assert hasattr(user_report, 'active') def test_nested_pageview(self): usage = OsfCountedUsageRecord( - cycle_coverage='2026-01-01', + cycle_coverage='2026.01.01', pageview_info={ 'page_url': 'https://example.com', 'referer_url': 'https://google.com', @@ -31,7 +31,7 @@ def test_nested_pageview(self): def test_pageview_info_autofill(self): obj = PageviewInfo( - cycle_coverage='2026-01-01', + cycle_coverage='2026.01.01', page_url='https://example.com/path/test', referer_url='https://google.com', timestamp=datetime(2024, 1, 1, 15, 0), From 00b055b5a13db955fdf1eab1a558f2bc5b64f33f Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 12:36:35 -0400 Subject: [PATCH 028/100] timedepth constants --- osf/metrics/es8_metrics.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 020a9c72c80..85c9141fba8 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -1,8 +1,10 @@ import datetime import enum +from urllib.parse import urlsplit + import elasticsearch8.dsl as esdsl +from elasticsearch_metrics import DAILY, MONTHLY import elasticsearch_metrics.imps.elastic8 as djelme -from urllib.parse import urlsplit from osf.metrics.utils import YearMonth @@ -183,19 +185,19 @@ class UsageByStorageAddon(esdsl.InnerDoc): class Es8StorageAddonUsage(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY usage_by_addon: list[UsageByStorageAddon] class Es8DownloadCountReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY daily_file_downloads: int class Es8InstitutionSummaryReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) institution_id: str @@ -208,7 +210,7 @@ class Es8InstitutionSummaryReport(djelme.CyclicRecord): class Es8NewUserDomainReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) domain_name: str @@ -216,7 +218,7 @@ class Es8NewUserDomainReport(djelme.CyclicRecord): class Es8NodeSummaryReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY nodes: NodeRunningTotals projects: NodeRunningTotals @@ -225,13 +227,13 @@ class Es8NodeSummaryReport(djelme.CyclicRecord): class Es8OsfstorageFileCountReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY files: FileRunningTotals class Es8PreprintSummaryReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) provider_key: str @@ -239,7 +241,7 @@ class Es8PreprintSummaryReport(djelme.CyclicRecord): class Es8UserSummaryReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 3 + CYCLE_TIMEDEPTH = DAILY active: int deactivated: int @@ -250,7 +252,7 @@ class Es8UserSummaryReport(djelme.CyclicRecord): class Es8SpamSummaryReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 2 + CYCLE_TIMEDEPTH = MONTHLY node_confirmed_spam: int node_confirmed_ham: int @@ -266,7 +268,7 @@ class Es8SpamSummaryReport(djelme.CyclicRecord): class Es8InstitutionalUserReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 2 + CYCLE_TIMEDEPTH = MONTHLY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) institution_id: str @@ -289,7 +291,7 @@ class Es8InstitutionalUserReport(djelme.CyclicRecord): class Es8InstitutionMonthlySummaryReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 2 + CYCLE_TIMEDEPTH = MONTHLY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', ) institution_id: str @@ -306,7 +308,7 @@ class Es8InstitutionMonthlySummaryReport(djelme.CyclicRecord): class Es8PublicItemUsageReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 2 + CYCLE_TIMEDEPTH = MONTHLY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid') # where noted, fields are meant to correspond to defined terms from COUNTER @@ -331,7 +333,7 @@ class Es8PublicItemUsageReport(djelme.CyclicRecord): class Es8PrivateSpamMetricsReport(djelme.CyclicRecord): - CYCLE_TIMEDEPTH = 2 + CYCLE_TIMEDEPTH = MONTHLY node_oopspam_flagged: int node_oopspam_hammed: int From dddc94e791d7de76f487d1a00ac767848279ce87 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 12:36:14 -0400 Subject: [PATCH 029/100] tidy gh actions with yaml anchors, health checks --- .github/workflows/test-build.yml | 131 ++++--------------------------- 1 file changed, 17 insertions(+), 114 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 0f2e101c408..3433e689a42 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -37,15 +37,19 @@ jobs: permissions: checks: write services: - elasticsearch8: - image: elasticsearch:8.19.11 + elasticsearch8: &ES8_SERVICE + image: elasticsearch:8.19.14 ports: - 9202:9200 env: + discovery.type: single-node xpack.security.enabled: false - node.name: singlenode - cluster.initial_master_nodes: singlenode - postgres: + options: >- + --health-cmd "curl -sf http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=30s" + --health-interval 10s + --health-timeout 30s + --health-retries 5 + postgres: &POSTGRES_SERVICE image: postgres env: POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} @@ -60,15 +64,6 @@ jobs: steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build - - name: Wait for Elasticsearch - run: | - echo "Waiting for ES8 health..." - sleep 5 - until curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; do - echo "ES8 not ready yet..." - sleep 5 - done - echo "ES8 started successfully!" - name: Run tests run: poetry run python3 -m invoke test-ci-addons --junit env: @@ -83,18 +78,7 @@ jobs: permissions: checks: write services: - postgres: - image: postgres - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build @@ -110,40 +94,13 @@ jobs: permissions: checks: write services: - elasticsearch8: - image: elasticsearch:8.19.11 - ports: - - 9202:9200 - env: - xpack.security.enabled: false - node.name: singlenode - cluster.initial_master_nodes: singlenode - postgres: - image: postgres - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + elasticsearch8: *ES8_SERVICE + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build - name: NVM & yarn install run: poetry run python3 -m invoke assets --dev - - name: Wait for Elasticsearch - run: | - echo "Waiting for ES8 health..." - sleep 5 - until curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; do - echo "ES8 not ready yet..." - sleep 5 - done - echo "ES8 started successfully!" - name: Run test run: poetry run python3 -m invoke test-ci-api1-and-js --junit env: @@ -158,26 +115,8 @@ jobs: permissions: checks: write services: - elasticsearch8: - image: elasticsearch:8.19.11 - ports: - - 9202:9200 - env: - xpack.security.enabled: false - node.name: singlenode - cluster.initial_master_nodes: singlenode - postgres: - image: postgres - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + elasticsearch8: *ES8_SERVICE + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build @@ -204,19 +143,7 @@ jobs: checks: write needs: build-cache services: - postgres: - image: postgres - - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build @@ -232,19 +159,7 @@ jobs: checks: write needs: build-cache services: - postgres: - image: postgres - - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE mailhog: image: mailhog/mailhog ports: @@ -265,19 +180,7 @@ jobs: checks: write needs: build-cache services: - postgres: - image: postgres - - env: - POSTGRES_PASSWORD: ${{ env.OSF_DB_PASSWORD }} - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - # Maps tcp port 5432 on service container to the host - - 5432:5432 + postgres: *POSTGRES_SERVICE steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build From 46a934f901b1c685aa33b9a34ff204b370abcdf5 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 13:59:10 -0400 Subject: [PATCH 030/100] simplify local elasticsearch8 config --- docker-compose.yml | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 09aedd58247..83e8fd27483 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -79,26 +79,22 @@ services: stdin_open: true elasticsearch8: - image: docker.elastic.co/elasticsearch/elasticsearch:8.19.11 - platform: linux/arm64 + image: elasticsearch:8.19.14 environment: - - ELASTIC_PASSWORD=secretsecret - - node.name=singlenode - discovery.type=single-node - - xpack.security.enabled=true - - xpack.security.http.ssl.enabled=true - - xpack.security.http.ssl.key=/elastic8_certs/singlenode/singlenode.key - - xpack.security.http.ssl.certificate=/elastic8_certs/singlenode/singlenode.crt - - xpack.security.http.ssl.certificate_authorities=/elastic8_certs/ca/ca.crt - - xpack.security.transport.ssl.enabled=true - - xpack.security.transport.ssl.key=/elastic8_certs/singlenode/singlenode.key - - xpack.security.transport.ssl.certificate=/elastic8_certs/singlenode/singlenode.crt - - xpack.security.transport.ssl.certificate_authorities=/elastic8_certs/ca/ca.crt - - xpack.security.transport.ssl.verification_mode=certificate + - xpack.security.enabled=false + - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage + - xpack.ml.enabled=false ports: - 9202:9200 volumes: - elasticsearch8_data_vol:/usr/share/elasticsearch/data + healthcheck: + start_period: 15s + test: ["CMD", "curl", "-sf", "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=30s"] + interval: 10s + timeout: 30s + retries: 5 stdin_open: true postgres: From 49f925945a2ea913dd56755fb9ac1d9efb905eb4 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 15:57:53 -0400 Subject: [PATCH 031/100] bump djelme to get fixes --- poetry.lock | 6 +++--- pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5bbe2ae1f49..90665bce81f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" -resolved_reference = "1b644bb927cfb28e3a23b28ad625279749d859e5" +reference = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6" +resolved_reference = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "3a5ea0758a65dac062ba307a2f29bdb8d637c2b0a78a2f68fea86c39516c6922" +content-hash = "ef1d6d327f5557e43482793b276ccb6c5fd07989f27367af3a3736a8547b4d1a" diff --git a/pyproject.toml b/pyproject.toml index 375b8cacd25..013df3f448d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "1b644bb927cfb28e3a23b28ad625279749d859e5"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 29839b975f440d1bbbe962d7e1ee0fce813e16c5 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 17:03:45 -0400 Subject: [PATCH 032/100] tests passing with djelme es8 - use elasticsearch_metrics.test.util - move "Es8" prefix to suffix - autofill fields on `save`, not `__init__` (to work with how esdsl loads search results) --- conftest.py | 22 +++----- osf/metrics/__init__.py | 18 +------ osf/metrics/es8_metrics.py | 57 +++++++++++---------- osf/metrics/reports.py | 2 + osf_tests/metrics/test_es8_metrics.py | 74 +++++++++++++++------------ 5 files changed, 82 insertions(+), 91 deletions(-) diff --git a/conftest.py b/conftest.py index 232b788c0fb..6eafa1b7a55 100644 --- a/conftest.py +++ b/conftest.py @@ -6,7 +6,7 @@ from django.db import transaction from elasticsearch6_dsl.connections import connections from website import settings as osf_settings -from elasticsearch_metrics.tests._test_util import RealElasticTestCase +from elasticsearch_metrics.tests.util import djelme_test_backends from faker import Factory import pytest import responses @@ -146,19 +146,9 @@ def _es_metrics_marker(request): yield return - connections.create_connection( - alias='osfmetrics_es6', - hosts=osf_settings.ELASTIC6_URI, - ) - - class _Es6TestCase(RealElasticTestCase, autosetup_djelme_backends=True): - ... - es6_test_case = _Es6TestCase() - es6_test_case.setUp() - try: + with djelme_test_backends(): yield - finally: - es6_test_case.tearDown() + @pytest.fixture def mock_share_responses(): @@ -356,6 +346,6 @@ def mock_gravy_valet_get_verified_links(): yield mock_get_verified_links -@pytest.fixture(autouse=True) -def load_notification_types(db, *args, **kwargs): - populate_notification_types(*args, **kwargs) +# @pytest.fixture(autouse=True) +# def load_notification_types(db, *args, **kwargs): +# populate_notification_types(*args, **kwargs) diff --git a/osf/metrics/__init__.py b/osf/metrics/__init__.py index 6cef14f5cf9..6056e6d92f3 100644 --- a/osf/metrics/__init__.py +++ b/osf/metrics/__init__.py @@ -17,16 +17,8 @@ StorageAddonUsage, UserSummaryReport, ) +from . import es8_metrics -from .es8_metrics import ( - Es8DownloadCountReport, - Es8UserSummaryReport, - Es8NodeSummaryReport, - Es8InstitutionSummaryReport, - Es8NewUserDomainReport, - Es8OsfstorageFileCountReport, - Es8StorageAddonUsage, -) DAILY_REPORTS = ( DownloadCountReport, @@ -37,13 +29,6 @@ PreprintSummaryReport, StorageAddonUsage, UserSummaryReport, - Es8DownloadCountReport, - Es8InstitutionSummaryReport, - Es8NewUserDomainReport, - Es8NodeSummaryReport, - Es8OsfstorageFileCountReport, - Es8StorageAddonUsage, - Es8UserSummaryReport ) @@ -53,4 +38,5 @@ 'PreprintView', 'PreprintDownload', 'RegistriesModerationMetrics', + 'es8_metrics', ) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 85c9141fba8..436a1c62d46 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -56,18 +56,6 @@ class PageviewInfo(esdsl.InnerDoc): for CountedAuthUsage generated by viewing a web page """ - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.page_path: str = '' - if self.page_url: - self.page_path = urlsplit(self.page_url).path.rstrip('/') - self.referer_domain: str = '' - if self.referer_url: - self.referer_domain = urlsplit(self.referer_url).netloc - self.hour_of_day: int = 0 - if self.timestamp: - self.hour_of_day = self.timestamp.hour - # fields that should be provided referer_url: str page_url: str @@ -78,10 +66,12 @@ def __init__(self, *args, **kwargs): }, )) + # fields auto-filled page_path: str referer_domain: str hour_of_day: int + ### # Event records @@ -103,6 +93,19 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): action_labels: list[str] pageview_info: PageviewInfo + def save(self, *args, **kwargs): + # autofill pageview_info fields + if self.pageview_info: + self.pageview_info.hour_of_day = self.timestamp.hour + _url = self.pageview_info.page_url + if _url: + self.pageview_info.page_path = urlsplit(_url).path.rstrip('/') + _ref_url = self.pageview_info.referer_url + if _ref_url: + self.pageview_info.referer_domain = urlsplit(_ref_url).netloc + super().save(*args, **kwargs) + + class ActionLabel(enum.Enum): SEARCH = 'search' # counter:Search VIEW = 'view' # counter:Investigation @@ -111,7 +114,7 @@ class ActionLabel(enum.Enum): API = 'api' # counter:TDM (aka "non-web api usage") -class Es8RegistriesModerationMetrics(djelme.EventRecord): +class RegistriesModerationMetricsEs8(djelme.EventRecord): registration_id: str provider_id: str trigger: str @@ -184,19 +187,19 @@ class UsageByStorageAddon(esdsl.InnerDoc): # Cyclic reports -class Es8StorageAddonUsage(djelme.CyclicRecord): +class StorageAddonUsageEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY usage_by_addon: list[UsageByStorageAddon] -class Es8DownloadCountReport(djelme.CyclicRecord): +class DownloadCountReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY daily_file_downloads: int -class Es8InstitutionSummaryReport(djelme.CyclicRecord): +class InstitutionSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id',) @@ -209,15 +212,15 @@ class Es8InstitutionSummaryReport(djelme.CyclicRecord): registered_projects: RegistrationRunningTotals -class Es8NewUserDomainReport(djelme.CyclicRecord): +class NewUserDomainReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'domain_name',) domain_name: str - domain_name: int + new_user_count: int -class Es8NodeSummaryReport(djelme.CyclicRecord): +class NodeSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY nodes: NodeRunningTotals @@ -226,13 +229,13 @@ class Es8NodeSummaryReport(djelme.CyclicRecord): registered_projects: RegistrationRunningTotals -class Es8OsfstorageFileCountReport(djelme.CyclicRecord): +class OsfstorageFileCountReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY files: FileRunningTotals -class Es8PreprintSummaryReport(djelme.CyclicRecord): +class PreprintSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'provider_key',) @@ -240,7 +243,7 @@ class Es8PreprintSummaryReport(djelme.CyclicRecord): preprint_count: int -class Es8UserSummaryReport(djelme.CyclicRecord): +class UserSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY active: int @@ -251,7 +254,7 @@ class Es8UserSummaryReport(djelme.CyclicRecord): unconfirmed: int -class Es8SpamSummaryReport(djelme.CyclicRecord): +class SpamSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY node_confirmed_spam: int @@ -267,7 +270,7 @@ class Es8SpamSummaryReport(djelme.CyclicRecord): user_marked_as_ham: int -class Es8InstitutionalUserReport(djelme.CyclicRecord): +class InstitutionalUserReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', 'user_id',) @@ -290,7 +293,7 @@ class Es8InstitutionalUserReport(djelme.CyclicRecord): storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) -class Es8InstitutionMonthlySummaryReport(djelme.CyclicRecord): +class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'institution_id', ) @@ -307,7 +310,7 @@ class Es8InstitutionMonthlySummaryReport(djelme.CyclicRecord): monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) -class Es8PublicItemUsageReport(djelme.CyclicRecord): +class PublicItemUsageReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY UNIQUE_TOGETHER_FIELDS = ('cycle_coverage', 'item_osfid') @@ -332,7 +335,7 @@ class Es8PublicItemUsageReport(djelme.CyclicRecord): cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long()) -class Es8PrivateSpamMetricsReport(djelme.CyclicRecord): +class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY node_oopspam_flagged: int diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py index 9d71ea7e8c2..62479e359cd 100644 --- a/osf/metrics/reports.py +++ b/osf/metrics/reports.py @@ -120,6 +120,8 @@ def save(self, *args, **kwargs): @receiver(metrics_pre_save) def set_report_id(sender, instance, **kwargs): + if not issubclass(sender, metrics.Metric): + return # skip es8 record types try: _unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS except AttributeError: diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index d6b3d4c4434..68d767fca89 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -1,42 +1,52 @@ from datetime import datetime +from elasticsearch_metrics.tests.util import djelme_test_backends +import pytest + from osf.metrics.es8_metrics import ( - Es8DownloadCountReport, - Es8UserSummaryReport, + PageviewInfo, + DownloadCountReportEs8, OsfCountedUsageRecord, - PageviewInfo ) class TestEs8Metrics: - def test_import_all_reports(self): - assert True - - def test_instantiate_of_reports(self): - download_report = Es8DownloadCountReport(cycle_coverage='2026.01.01') - assert hasattr(download_report, 'daily_file_downloads') - - user_report = Es8UserSummaryReport(cycle_coverage='2026.01.01') - assert hasattr(user_report, 'active') - - def test_nested_pageview(self): - usage = OsfCountedUsageRecord( - cycle_coverage='2026.01.01', - pageview_info={ - 'page_url': 'https://example.com', - 'referer_url': 'https://google.com', - } - ) - assert usage.pageview_info is not None - - def test_pageview_info_autofill(self): - obj = PageviewInfo( - cycle_coverage='2026.01.01', - page_url='https://example.com/path/test', - referer_url='https://google.com', + """smoke tests to check that djelme records can be saved and searched""" + @pytest.fixture(autouse=True) + def _real_elastic(self): + with djelme_test_backends(): + yield + + def test_nested_pageview_autofill(self): + usage = OsfCountedUsageRecord.record( timestamp=datetime(2024, 1, 1, 15, 0), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + pageview_info=PageviewInfo( + page_url="https://example.com/path/test", + referer_url="https://google.com", + route_name='foo.bar', + page_title='title title', + ), ) - - assert obj.page_path == '/path/test' - assert obj.referer_domain == 'google.com' - assert obj.hour_of_day == 15 + assert usage.pageview_info.page_path == "/path/test" + assert usage.pageview_info.referer_domain == "google.com" + assert usage.pageview_info.hour_of_day == 15 + + def test_save_report(self): + _saved = DownloadCountReportEs8.record( + cycle_coverage="2026.1.1", + daily_file_downloads=17, + ) + DownloadCountReportEs8.refresh_timeseries_indexes() + _response = DownloadCountReportEs8.search().execute() + (_fetched,) = _response + assert _fetched.meta.id == _saved.meta.id + assert _fetched.cycle_coverage == '2026.1.1' + assert _fetched.daily_file_downloads == 17 From 619cac7cca77df36e2d04f37dd55a060d36e4f75 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 17:12:45 -0400 Subject: [PATCH 033/100] fix(test): patch check_index_template --- osf_tests/metrics/test_daily_report.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/osf_tests/metrics/test_daily_report.py b/osf_tests/metrics/test_daily_report.py index 9301cdb114f..5228e2342c5 100644 --- a/osf_tests/metrics/test_daily_report.py +++ b/osf_tests/metrics/test_daily_report.py @@ -10,8 +10,9 @@ class TestDailyReportKey: @pytest.fixture def mock_save(self): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save + with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save def test_default(self, mock_save): # only one of this type of report per day From 8cec095a5b604a9f97abd4297af96774c7e585ac Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 17:20:45 -0400 Subject: [PATCH 034/100] uncomment autouse fixture --- conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 6eafa1b7a55..7adf6bdeba6 100644 --- a/conftest.py +++ b/conftest.py @@ -346,6 +346,6 @@ def mock_gravy_valet_get_verified_links(): yield mock_get_verified_links -# @pytest.fixture(autouse=True) -# def load_notification_types(db, *args, **kwargs): -# populate_notification_types(*args, **kwargs) +@pytest.fixture(autouse=True) +def load_notification_types(db, *args, **kwargs): + populate_notification_types(*args, **kwargs) From c24430fff7b2fdca860be0bb216bad20108a67ab Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 17:25:56 -0400 Subject: [PATCH 035/100] remove unnecessary loop --- .github/workflows/test-build.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 3433e689a42..011b621cca9 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -120,15 +120,6 @@ jobs: steps: - uses: actions/checkout@v2 - uses: ./.github/actions/start-build - - name: Wait for Elasticsearch - run: | - echo "Waiting for ES8 health..." - sleep 5 - until curl -sf http://localhost:9202/_cluster/health?wait_for_status=yellow; do - echo "ES8 not ready yet..." - sleep 5 - done - echo "ES8 started successfully!" - name: Run tests run: poetry run python3 -m invoke test-ci-api2 --junit env: From cd3282786f5fc2c715f8bd0387903e6ba2d44d9a Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 17:28:40 -0400 Subject: [PATCH 036/100] plac8 flake8 --- conftest.py | 1 - osf_tests/metrics/test_es8_metrics.py | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conftest.py b/conftest.py index 7adf6bdeba6..9d8861a1e97 100644 --- a/conftest.py +++ b/conftest.py @@ -5,7 +5,6 @@ from django.db import transaction from elasticsearch6_dsl.connections import connections -from website import settings as osf_settings from elasticsearch_metrics.tests.util import djelme_test_backends from faker import Factory import pytest diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index 68d767fca89..e93579628dc 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -12,6 +12,7 @@ class TestEs8Metrics: """smoke tests to check that djelme records can be saved and searched""" + @pytest.fixture(autouse=True) def _real_elastic(self): with djelme_test_backends(): @@ -29,19 +30,19 @@ def test_nested_pageview_autofill(self): platform_iri='https://osf.example', user_is_authenticated=False, pageview_info=PageviewInfo( - page_url="https://example.com/path/test", - referer_url="https://google.com", + page_url='https://example.com/path/test', + referer_url='https://google.com', route_name='foo.bar', page_title='title title', ), ) - assert usage.pageview_info.page_path == "/path/test" - assert usage.pageview_info.referer_domain == "google.com" + assert usage.pageview_info.page_path == '/path/test' + assert usage.pageview_info.referer_domain == 'google.com' assert usage.pageview_info.hour_of_day == 15 def test_save_report(self): _saved = DownloadCountReportEs8.record( - cycle_coverage="2026.1.1", + cycle_coverage='2026.1.1', daily_file_downloads=17, ) DownloadCountReportEs8.refresh_timeseries_indexes() From db938be047d4df29e2d01ee18b923f9c681eaa35 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 17:47:24 -0400 Subject: [PATCH 037/100] remove unused local env vars --- .docker-compose.env | 2 -- 1 file changed, 2 deletions(-) diff --git a/.docker-compose.env b/.docker-compose.env index 2542d16e841..80eebc8707b 100644 --- a/.docker-compose.env +++ b/.docker-compose.env @@ -8,9 +8,7 @@ API_DOMAIN=http://localhost:8000/ ELASTIC_URI=192.168.168.167:9200 ELASTIC6_URI=192.168.168.167:9201 ELASTIC8_URI=http://192.168.168.167:9202 -ELASTIC8_CERT_PATH=/elastic8_certs/ca/ca.crt ELASTIC8_USERNAME=elastic -ELASTIC8_SECRET=secretsecret OSF_DB_HOST=192.168.168.167 DB_HOST=192.168.168.167 REDIS_HOST=redis://192.168.168.167:6379 From 52a2bc94935057d874e9fc3cdf28b6f5d0e9e684 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 18:16:12 -0400 Subject: [PATCH 038/100] better use waffle switch ELASTICSEARCH_METRICS --- .../test_registries_moderation_metrics.py | 12 ------------ conftest.py | 18 ++++++++++++------ osf/models/registrations.py | 5 ++++- osf_tests/metrics/test_monthly_report.py | 5 +++-- 4 files changed, 19 insertions(+), 21 deletions(-) diff --git a/api_tests/metrics/test_registries_moderation_metrics.py b/api_tests/metrics/test_registries_moderation_metrics.py index 0f3dddb79b6..f5d3a047b10 100644 --- a/api_tests/metrics/test_registries_moderation_metrics.py +++ b/api_tests/metrics/test_registries_moderation_metrics.py @@ -1,7 +1,5 @@ import pytest -from waffle.testutils import override_switch -from osf import features from osf_tests.factories import RegistrationFactory, AuthUserFactory from osf.utils.workflows import RegistrationModerationStates, RegistrationModerationTriggers from osf.metrics import RegistriesModerationMetrics @@ -17,11 +15,6 @@ class TestRegistrationModerationMetrics: def registration(self): return RegistrationFactory() - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - @pytest.mark.es_metrics def test_record_transitions(self, registration): with capture_notifications(): @@ -50,11 +43,6 @@ class TestRegistrationModerationMetricsView: def registration(self): return RegistrationFactory() - @pytest.fixture(autouse=True) - def enable_elasticsearch_metrics(self): - with override_switch(features.ELASTICSEARCH_METRICS, active=True): - yield - @pytest.fixture def user(self): user = AuthUserFactory() diff --git a/conftest.py b/conftest.py index 9d8861a1e97..0c944957661 100644 --- a/conftest.py +++ b/conftest.py @@ -10,12 +10,15 @@ import pytest import responses import xml.etree.ElementTree as ET +from waffle.testutils import override_switch from api_tests.share import _utils as shtrove_test_utils from framework.celery_tasks import app as celery_app from osf.external.spam import tasks as spam_tasks from website import settings as website_settings from osf.management.commands.populate_notification_types import populate_notification_types +from osf import features + def pytest_configure(config): if not os.getenv('GITHUB_ACTIONS') == 'true': @@ -141,12 +144,15 @@ def _es_metrics_marker(request): """ marker = request.node.get_closest_marker('es_metrics') - if not marker: - yield - return - - with djelme_test_backends(): - yield + if marker: + with ( + override_switch(features.ELASTICSEARCH_METRICS, active=True), + djelme_test_backends(), + ): + yield + else: + with override_switch(features.ELASTICSEARCH_METRICS, active=False): + yield @pytest.fixture diff --git a/osf/models/registrations.py b/osf/models/registrations.py index e1d819b43bf..e9114355649 100644 --- a/osf/models/registrations.py +++ b/osf/models/registrations.py @@ -14,9 +14,11 @@ UserObjectPermissionBase, ) from dirtyfields import DirtyFieldsMixin +import waffle from framework.auth import Auth from framework.exceptions import PermissionsError +from osf import features from osf.models import Identifier from osf.utils.fields import NonNaiveDateTimeField, LowercaseCharField from osf.utils.permissions import ADMIN, READ, WRITE @@ -782,7 +784,8 @@ def _write_registration_action(self, from_state, to_state, initiated_by, comment comment=comment ) action.save() - RegistriesModerationMetrics.record_transitions(action) + if waffle.switch_is_active(features.ELASTICSEARCH_METRICS): + RegistriesModerationMetrics.record_transitions(action) moderation_notifications = { RegistrationModerationTriggers.SUBMIT: notify.notify_submit, diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py index cc8c4137cb2..9d0980cd5b8 100644 --- a/osf_tests/metrics/test_monthly_report.py +++ b/osf_tests/metrics/test_monthly_report.py @@ -11,8 +11,9 @@ class TestMonthlyReportKey: @pytest.fixture def mock_save(self): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save + with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save def test_default(self, mock_save): # only one of this type of report per month From 82de65b8ed8c2eb20e30fcb09eb139e40e7cbcd9 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 18:18:46 -0400 Subject: [PATCH 039/100] mock check mock save --- api_tests/metrics/test_counted_usage.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/api_tests/metrics/test_counted_usage.py b/api_tests/metrics/test_counted_usage.py index 568d663be9e..e2cb7040037 100644 --- a/api_tests/metrics/test_counted_usage.py +++ b/api_tests/metrics/test_counted_usage.py @@ -38,8 +38,9 @@ def assert_saved_with(mock_save, *, expected_doc_id=None, expected_attrs): @pytest.fixture def mock_save(): - with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: - yield mock_save + with mock.patch('elasticsearch_metrics.imps.elastic6.BaseMetric.check_index_template'): + with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: + yield mock_save @pytest.mark.django_db From b33280df27eee0082eebcf9f037b9eea62e0df07 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 15 Apr 2026 15:57:03 +0300 Subject: [PATCH 040/100] remove the override --- conftest.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/conftest.py b/conftest.py index 0c944957661..198316f1cc4 100644 --- a/conftest.py +++ b/conftest.py @@ -144,15 +144,15 @@ def _es_metrics_marker(request): """ marker = request.node.get_closest_marker('es_metrics') - if marker: - with ( - override_switch(features.ELASTICSEARCH_METRICS, active=True), - djelme_test_backends(), - ): - yield - else: - with override_switch(features.ELASTICSEARCH_METRICS, active=False): - yield + if not marker: + yield + return + + with ( + override_switch(features.ELASTICSEARCH_METRICS, active=True), + djelme_test_backends(), + ): + yield @pytest.fixture From 1cef7d335c8a00677f6e37ddb975bd14619e02d6 Mon Sep 17 00:00:00 2001 From: Bohdan Odintsov Date: Wed, 15 Apr 2026 16:55:34 +0300 Subject: [PATCH 041/100] fix failing test --- osf_tests/metrics/test_monthly_report.py | 1 + 1 file changed, 1 insertion(+) diff --git a/osf_tests/metrics/test_monthly_report.py b/osf_tests/metrics/test_monthly_report.py index 9d0980cd5b8..ba981e997d6 100644 --- a/osf_tests/metrics/test_monthly_report.py +++ b/osf_tests/metrics/test_monthly_report.py @@ -80,6 +80,7 @@ class Meta: @pytest.mark.es_metrics +@pytest.mark.django_db class TestLastMonthReport: @pytest.fixture def osfid(self): From 029647f4c70581bbd4a47fc1dbf266a020fccb00 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 9 Apr 2026 09:40:37 -0400 Subject: [PATCH 042/100] add background_migration queue (in the osf way) --- framework/celery_tasks/routers.py | 2 ++ website/settings/defaults.py | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/framework/celery_tasks/routers.py b/framework/celery_tasks/routers.py index c33238780e8..d9d6e335286 100644 --- a/framework/celery_tasks/routers.py +++ b/framework/celery_tasks/routers.py @@ -11,6 +11,8 @@ def match_by_module(task_path): return CeleryConfig.task_med_queue if task_subpath in CeleryConfig.high_pri_modules: return CeleryConfig.task_high_queue + if task_subpath in CeleryConfig.background_migration_modules: + return CeleryConfig.task_background_migration_queue if task_subpath in CeleryConfig.remote_computing_modules: return CeleryConfig.task_remote_computing_queue if task_subpath in CeleryConfig.account_status_changes_modules: diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 1e8032cc95c..3053f9d1075 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -421,6 +421,7 @@ class CeleryConfig: task_account_status_changes_queue = 'account_status_changes' task_external_high_queue = 'external_high' task_external_low_queue = 'external_low' + task_background_migration_queue = 'background_migration' external_high_modules = { 'osf.tasks.log_gv_addon', @@ -487,6 +488,10 @@ class CeleryConfig: 'api.share.utils', } + background_migration_modules = { + 'osf.management.commands.metrics_es8_migration', + } + try: from kombu import Queue, Exchange except ImportError: @@ -540,6 +545,12 @@ class CeleryConfig: routing_key=task_external_low_queue, consumer_arguments={'x-priority': -2}, ), + Queue( + task_background_migration_queue, + Exchange(task_background_migration_queue), + routing_key=task_background_migration_queue, + consumer_arguments={'x-priority': -1}, + ), ) task_default_exchange_type = 'direct' From ac397e8c509df085257ba214fa621fc5b61e8c13 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 14 Apr 2026 11:30:17 -0400 Subject: [PATCH 043/100] wip --- .../commands/metrics_es8_migration.py | 184 ++++++++++++++++++ osf/management/commands/sync_databases.py | 2 +- 2 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 osf/management/commands/metrics_es8_migration.py diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/metrics_es8_migration.py new file mode 100644 index 00000000000..46b187c63bf --- /dev/null +++ b/osf/management/commands/metrics_es8_migration.py @@ -0,0 +1,184 @@ +import datetime +import logging + + +from django.core.management import call_command +from django.core.management.base import BaseCommand +from elasticsearch6 import helpers as es6_helpers +from elasticsearch8 import helpers as es8_helpers +from elasticsearch_metrics.registry import djelme_registry +from elasticsearch_metrics.imps import elastic8 as djel8me +from elasticsearch_metrics.util.timeparts import format_timeparts + +from framework.celery_tasks import app as celery_app +from osf.metrics import reports as es6_reports +from osf.metrics import es8_metrics, RegistriesModerationMetrics + + +_logger = logging.getLogger(__name__) + +_UNCHANGED_RECORDTYPES = { + # reports + es6_reports.StorageAddonUsage: es8_metrics.StorageAddonUsageEs8, + es6_reports.DownloadCountReport: es8_metrics.DownloadCountReportEs8, + es6_reports.InstitutionSummaryReport: es8_metrics.InstitutionSummaryReportEs8, + es6_reports.NewUserDomainReport: es8_metrics.NewUserDomainReportEs8, + es6_reports.NodeSummaryReport: es8_metrics.NodeSummaryReportEs8, + es6_reports.OsfstorageFileCountReport: es8_metrics.OsfstorageFileCountReportEs8, + es6_reports.PreprintSummaryReport: es8_metrics.PreprintSummaryReportEs8, + es6_reports.UserSummaryReport: es8_metrics.UserSummaryReportEs8, + es6_reports.SpamSummaryReport: es8_metrics.SpamSummaryReportEs8, + es6_reports.InstitutionalUserReport: es8_metrics.InstitutionalUserReportEs8, + es6_reports.InstitutionMonthlySummaryReport: es8_metrics.InstitutionMonthlySummaryReportEs8, + es6_reports.PrivateSpamMetricsReport: es8_metrics.PrivateSpamMetricsReportEs8, + # events + RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, +} + + +def _debug_migrate(es8_client, each_new): + for _each in each_new: + print(_each) + + +def _do_migrate(es8_client, each_new): + es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) + + +def _es6_scan(es6_recordtype, from_when: str, until_when: str): + return es6_helpers.scan( + es6_client, + index=es6_recordtype._template_pattern, + query={"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + ) + + +def _cycle_coverage_daily(report_date): ... + + +def _cycle_coverage_monthly(report_yearmonth): ... + + +def _unchanged_report_kwargs(es6_recordtype, hit): + if issubclass(es6_recordtype, es6_reports.DailyReport): + _cycle_coverage = format_timeparts( + datetime.date.fromisoformat(hit.pop("report_date")), djel8me.DAILY + ) + elif issubclass(es6_recordtype, es6_reports.MonthlyReport): + _cycle_coverage = format_timeparts(hit.pop("report_yearmonth"), djel8me.MONTHLY) + return { + **hit, + 'cycle_coverage': _cycle_coverage, + } + + +@celery_app.task +def migrate_unchanged_recordtype( + es6_recordtype_name: str, +): + _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) + _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] + + def _each_new(): + for _hit in _es6_scan(_es6_recordtype, from_when, until_when): + breakpoint() + yield _es8_recordtype.record( + ..., + using=False, # saved in bulk + ) + + _debug_migrate(_each_new()) + # _do_migrate(_each_new()) + + +@celery_app.task +def migrate_preprint_views(from_date, until_date): + # convert to counted-usage + ... + + +@celery_app.task +def migrate_preprint_downloads(from_date, until_date): + # convert to counted-usage + ... + + +@celery_app.task +def migrate_usage_reports(from_date, until_date): + # from PublicItemUsageReport to PublicItemUsageReportEs8 + # add cumulative count + ... + + +class Command(BaseCommand): + def add_arguments(self, parser): + parser.add_argument( + "--start", + action="store_true", + ) + parser.add_argument( + "--unchanged", + action="store_true", + ) + parser.add_argument( + "--usage-events", + action="store_true", + ) + parser.add_argument( + "--usage-reports", + action="store_true", + ) + + def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): + call_command('djelme_backend_setup') # ensure all index templates + _default_all = not any((unchanged, usage_events, usage_reports)) + + if unchanged or _default_all: + self._handle_unchanged(start=start) + if usage_events or _default_all: + self._handle_usage_events(start=start) + if usage_reports or _default_all: + self._handle_usage_reports(start=start) + + def _handle_unchanged(self, *, start: bool): + # for each (unchanged) report/event: + for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): + _es6_count = _es6_cls.search().count() + _es8_count = _es8_cls.search().count() + _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) + self.stdout.write(f'{_es6_cls.__name__} (es6):\t{_es6_count}') + self.stdout.write(f'{_es8_cls.__name__}:\t{_style(_es8_count)}') + if start: + self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + # TODO: migrate_unchanged_recordtype.apply_async(...) + self.stdout.write('---') + + def _handle_usage_events(self, *, start: bool): + # for counted-usage events: + # TODO: last X months only + # get/compare/print cardinalities + # schedule (per-day?) tasks (if --start) + _es6_pview_count = PreprintView.search().count() + _es6_pdownload_count = PreprintDownload.search().count() + _es6_usage_event_count = CountedAuthUsage.search().count() + _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + _es8_count = OsfCountedUsageEvent.search().count() + _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) + self.stdout.write(f'{PreprintView.__name__} (es6):\t{_es6_pview_count}') + self.stdout.write(f'{PreprintDownload.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'{CountedAuthUsage.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'total (es6):\t{_es6_count}') + self.stdout.write(f'{OsfCountedUsageEvent.__name__}:\t{_style(_es8_count)}') + if start: + self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + # TODO: migrate_usage_events.apply_async(...) + + def _handle_usage_reports(self, *, start: bool): + _es6_count = PublicItemUsageReport.search().count() + _es8_count = PublicItemUsageReportEs8.search().count() + _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) + self.stdout.write(f'{PublicItemUsageReport.__name__} (es6):\t{_es6_count}') + self.stdout.write(f'{PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') + _item_count + # (if --start) schedule task per item (by composite agg on es6 public usage reports) + # each item-task iter thru reports oldest to newest, adding cumulative counts diff --git a/osf/management/commands/sync_databases.py b/osf/management/commands/sync_databases.py index c31d63ea16e..b5030b4bba7 100644 --- a/osf/management/commands/sync_databases.py +++ b/osf/management/commands/sync_databases.py @@ -20,7 +20,7 @@ def handle(self, *args, **options): ['migrate'], ] if waffle.switch_is_active(features.ELASTICSEARCH_METRICS): - COMMANDS.append(['sync_metrics']) + COMMANDS.append(['djelme_backend_setup']) for check in COMMANDS: call_command(*check) From ef981e7c886b67806d60c1d37261dde4cd6e1e8d Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 08:25:11 -0400 Subject: [PATCH 044/100] wip --- .../commands/metrics_es8_migration.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/metrics_es8_migration.py index 46b187c63bf..c2f765eef72 100644 --- a/osf/management/commands/metrics_es8_migration.py +++ b/osf/management/commands/metrics_es8_migration.py @@ -11,6 +11,11 @@ from elasticsearch_metrics.util.timeparts import format_timeparts from framework.celery_tasks import app as celery_app +from osf.metrics.preprint_metrics import ( + PreprintView as PreprintViewEs6, + PreprintDownload as PreprintDownloadEs6, +) +from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics @@ -132,7 +137,6 @@ def add_arguments(self, parser): def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): call_command('djelme_backend_setup') # ensure all index templates _default_all = not any((unchanged, usage_events, usage_reports)) - if unchanged or _default_all: self._handle_unchanged(start=start) if usage_events or _default_all: @@ -143,12 +147,13 @@ def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): def _handle_unchanged(self, *, start: bool): # for each (unchanged) report/event: for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): + # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) self.stdout.write(f'{_es6_cls.__name__} (es6):\t{_es6_count}') self.stdout.write(f'{_es8_cls.__name__}:\t{_style(_es8_count)}') - if start: + if start: # schedule task self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') # TODO: migrate_unchanged_recordtype.apply_async(...) self.stdout.write('---') @@ -156,29 +161,30 @@ def _handle_unchanged(self, *, start: bool): def _handle_usage_events(self, *, start: bool): # for counted-usage events: # TODO: last X months only - # get/compare/print cardinalities - # schedule (per-day?) tasks (if --start) - _es6_pview_count = PreprintView.search().count() - _es6_pdownload_count = PreprintDownload.search().count() - _es6_usage_event_count = CountedAuthUsage.search().count() + # display counts for each view/download event type + _es6_pview_count = PreprintViewEs6.search().count() + _es6_pdownload_count = PreprintDownloadEs6.search().count() + _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count - _es8_count = OsfCountedUsageEvent.search().count() + _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{PreprintView.__name__} (es6):\t{_es6_pview_count}') - self.stdout.write(f'{PreprintDownload.__name__} (es6):\t{_es6_pdownload_count}') - self.stdout.write(f'{CountedAuthUsage.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'{PreprintViewEs6.__name__} (es6):\t{_es6_pview_count}') + self.stdout.write(f'{PreprintDownloadEs6.__name__} (es6):\t{_es6_pdownload_count}') + self.stdout.write(f'{CountedUsageEs6.__name__} (es6):\t{_es6_pdownload_count}') self.stdout.write(f'total (es6):\t{_es6_count}') - self.stdout.write(f'{OsfCountedUsageEvent.__name__}:\t{_style(_es8_count)}') - if start: + self.stdout.write(f'{es8_metrics.OsfCountedUsageRecord.__name__}:\t{_style(_es8_count)}') + if start: # schedule (per-day?) tasks (if --start) self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') # TODO: migrate_usage_events.apply_async(...) def _handle_usage_reports(self, *, start: bool): - _es6_count = PublicItemUsageReport.search().count() - _es8_count = PublicItemUsageReportEs8.search().count() + # display total report counts + _es6_count = es6_reports.PublicItemUsageReport.search().count() + _es8_count = es8_metrics.PublicItemUsageReportEs8.search().count() _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{PublicItemUsageReport.__name__} (es6):\t{_es6_count}') - self.stdout.write(f'{PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') + self.stdout.write(f'{es6_reports.PublicItemUsageReport.__name__} (es6):\t{_es6_count}') + self.stdout.write(f'{es8_metrics.PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') + # display distinct item counts _item_count # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts From 9ed70f3cda89f38b216455e3986f088f03814842 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 11:55:33 -0400 Subject: [PATCH 045/100] quieter elastic logs --- conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/conftest.py b/conftest.py index 198316f1cc4..e80c4e5c566 100644 --- a/conftest.py +++ b/conftest.py @@ -43,6 +43,8 @@ def pytest_configure(config): 'transitions.core', 'MARKDOWN', 'elasticsearch', + 'elastic_transport', + 'elasticsearch_metrics', ] for logger_name in SILENT_LOGGERS: logging.getLogger(logger_name).setLevel(logging.CRITICAL) From be1ed2feec851748d16c0b55b20815ce3aa87917 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 11:56:05 -0400 Subject: [PATCH 046/100] wip --- .../commands/metrics_es8_migration.py | 146 +++++++++++++++--- osf/metrics/es8_metrics.py | 35 +++++ 2 files changed, 156 insertions(+), 25 deletions(-) diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/metrics_es8_migration.py index c2f765eef72..ff6cdfe8b0f 100644 --- a/osf/management/commands/metrics_es8_migration.py +++ b/osf/management/commands/metrics_es8_migration.py @@ -1,7 +1,6 @@ import datetime import logging - from django.core.management import call_command from django.core.management.base import BaseCommand from elasticsearch6 import helpers as es6_helpers @@ -22,6 +21,10 @@ _logger = logging.getLogger(__name__) +_USAGE_MONTHS_BACK = 3 + +_MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control + _UNCHANGED_RECORDTYPES = { # reports es6_reports.StorageAddonUsage: es8_metrics.StorageAddonUsageEs8, @@ -40,7 +43,6 @@ RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, } - def _debug_migrate(es8_client, each_new): for _each in each_new: print(_each) @@ -49,7 +51,6 @@ def _debug_migrate(es8_client, each_new): def _do_migrate(es8_client, each_new): es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) - def _es6_scan(es6_recordtype, from_when: str, until_when: str): return es6_helpers.scan( es6_client, @@ -58,6 +59,46 @@ def _es6_scan(es6_recordtype, from_when: str, until_when: str): ) +def _es6_usage_report_counts() -> tuple[int, int]: + _search = ( + es6_reports.PublicItemUsageReport.search() + ) + _search.aggs.metric( + 'agg_item_count', + 'cardinality', + field='item_osfid', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _total_count = _response.hits.total + _item_count = ( + _response.aggregations.agg_item_count.value + if 'agg_item_count' in _response.aggregations + else 0 + ) + return (_total_count, _item_count) + + +def _es8_usage_report_counts() -> tuple[int, int]: + _search = ( + es8_metrics.PublicItemUsageReportEs8.search() + ) + _search.aggs.metric( + 'agg_item_count', + 'cardinality', + field='item_osfid', + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _total_count = _response.hits.total.value + _item_count = ( + _response.aggregations.agg_item_count.value + if 'agg_item_count' in _response.aggregations + else 0 + ) + return (_total_count, _item_count) + + def _cycle_coverage_daily(report_date): ... @@ -114,7 +155,6 @@ def migrate_usage_reports(from_date, until_date): # add cumulative count ... - class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( @@ -133,9 +173,22 @@ def add_arguments(self, parser): "--usage-reports", action="store_true", ) + parser.add_argument( + "--clear-state", + action="store_true", + ) + parser.add_argument( + "--no-setup", + action="store_true", + ) - def handle(self, *, start, unchanged, usage_events, usage_reports, **kwargs): - call_command('djelme_backend_setup') # ensure all index templates + def handle(self, *, start, unchanged, usage_events, usage_reports, clear_state, no_setup, **kwargs): + self._quiet_chatty_loggers() + if not no_setup: + call_command('djelme_backend_setup') + if clear_state: + self._clear_state() + self._display_started_at(start=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: self._handle_unchanged(start=start) @@ -150,11 +203,11 @@ def _handle_unchanged(self, *, start: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{_es6_cls.__name__} (es6):\t{_es6_count}') - self.stdout.write(f'{_es8_cls.__name__}:\t{_style(_es8_count)}') + #_es8_count = _es8_cls.search().count() + self._write_tabbed('es6', _es6_cls, _es6_count) + self._write_tabbed('es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count)) if start: # schedule task - self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + self._write_tabbed('starting', _es6_cls, '=>', _es8_cls) # TODO: migrate_unchanged_recordtype.apply_async(...) self.stdout.write('---') @@ -167,24 +220,67 @@ def _handle_usage_events(self, *, start: bool): _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{PreprintViewEs6.__name__} (es6):\t{_es6_pview_count}') - self.stdout.write(f'{PreprintDownloadEs6.__name__} (es6):\t{_es6_pdownload_count}') - self.stdout.write(f'{CountedUsageEs6.__name__} (es6):\t{_es6_pdownload_count}') - self.stdout.write(f'total (es6):\t{_es6_count}') - self.stdout.write(f'{es8_metrics.OsfCountedUsageRecord.__name__}:\t{_style(_es8_count)}') + self._write_tabbed('es6', PreprintViewEs6, _es6_pview_count) + self._write_tabbed('es6', PreprintDownloadEs6, _es6_pdownload_count) + self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed('es6', '(total to migrate)', _es6_count) + self._write_tabbed('es8', es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count)) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f'starting {_es6_cls.__name__} => {_es8_cls.__name__}') + self.stdout.write(f'starting usages => {es8_metrics.OsfCountedUsageRecord}') # TODO: migrate_usage_events.apply_async(...) + self.stdout.write('---') def _handle_usage_reports(self, *, start: bool): - # display total report counts - _es6_count = es6_reports.PublicItemUsageReport.search().count() - _es8_count = es8_metrics.PublicItemUsageReportEs8.search().count() - _style = (self.style.SUCCESS if (_es6_count == _es8_count) else self.style.NOTICE) - self.stdout.write(f'{es6_reports.PublicItemUsageReport.__name__} (es6):\t{_es6_count}') - self.stdout.write(f'{es8_metrics.PublicItemUsageReportEs8.__name__}:\t{_style(_es8_count)}') - # display distinct item counts - _item_count + # display counts of reports and distinct items + _es6_count, _es6_item_count = _es6_usage_report_counts() + _es8_count, _es8_item_count = _es8_usage_report_counts() + self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, _es8_count, style=self._eq_style(_es8_count, _es6_count)) + self._write_tabbed('es6', es6_reports.PublicItemUsageReport, '(items)', _es6_item_count) + self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, '(items)', _es8_item_count, + style=self._eq_style(_es8_item_count, _es6_item_count)) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts + if start: # schedule per-item tasks + self.stdout.write(f'starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}') + # TODO: migrate_usage_events.apply_async(...) + self.stdout.write('---') + + def _display_started_at(self, start): + _started_at = es8_metrics.Elastic6To8State.get_started_at() + if _started_at: + self.stdout.write( + f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}' + ) + elif start: + _started_at = es8_metrics.Elastic6To8State.set_started_at_now() + self.stdout.write( + f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}' + ) + else: + self.stdout.write( + 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)' + ) + self.stdout.write('---') + + def _clear_state(self): + es8_metrics.Elastic6To8State.search().delete() + + def _eq_style(self, num: int, should_be: int): + return self.style.SUCCESS if (num == should_be) else self.style.NOTICE + + def _write_tabbed(self, *strables, style=None): + def _to_str(strable): + if isinstance(strable, type): + return strable.__name__ + return str(strable) + self.stdout.write('\t'.join(map(_to_str, strables)), style) + + def _quiet_chatty_loggers(self): + _chatty_loggers = [ + 'elasticsearch', + 'elastic_transport', + 'elasticsearch_metrics', + ] + for logger_name in _chatty_loggers: + logging.getLogger(logger_name).setLevel(logging.ERROR) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 436a1c62d46..8b5e9dd5bc8 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -346,3 +346,38 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): preprint_oopspam_hammed: int preprint_akismet_flagged: int preprint_akismet_hammed: int + + +### +# data migration state + +class Elastic6To8State(djelme.DjelmeRecordtype): + """index for storing values helpful for keeping track of the elastic 6->8 data migration""" + UNIQUE_TOGETHER_FIELDS = ('key',) + key: str + value: str | None + timestamp: datetime.datetime = esdsl.mapped_field( + default_factory=lambda: datetime.datetime.now(datetime.UTC), + ) + + class Index: + name = 'osf_elastic6to8state' + + @classmethod + def get_by_key(cls, key: str): + _response = cls.search().query({'term': {'key': key}})[0].execute() + return _response[0] if _response else None + + @classmethod + def get_timestamp(cls, key: str) -> datetime.datetime | None: + _record = cls.get_by_key(key) + return _record.timestamp if _record else None + + @classmethod + def get_started_at(cls): + return cls.get_timestamp('started_at') + + @classmethod + def set_started_at_now(cls): + _record = cls.record(key='started_at') + return _record.timestamp From 64aeeaba0d84cf33d9c6726b86c8844f127520e3 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 15 Apr 2026 16:03:06 -0400 Subject: [PATCH 047/100] wip --- ...ics_es8_migration.py => migrate_metrics_6to8.py} | 13 ++++++------- poetry.lock | 6 +++--- pyproject.toml | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) rename osf/management/commands/{metrics_es8_migration.py => migrate_metrics_6to8.py} (99%) diff --git a/osf/management/commands/metrics_es8_migration.py b/osf/management/commands/migrate_metrics_6to8.py similarity index 99% rename from osf/management/commands/metrics_es8_migration.py rename to osf/management/commands/migrate_metrics_6to8.py index ff6cdfe8b0f..104caccfb6c 100644 --- a/osf/management/commands/metrics_es8_migration.py +++ b/osf/management/commands/migrate_metrics_6to8.py @@ -158,27 +158,27 @@ def migrate_usage_reports(from_date, until_date): class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( - "--start", + "--no-setup", action="store_true", ) parser.add_argument( - "--unchanged", + "--clear-state", action="store_true", ) parser.add_argument( - "--usage-events", + "--start", action="store_true", ) parser.add_argument( - "--usage-reports", + "--unchanged", action="store_true", ) parser.add_argument( - "--clear-state", + "--usage-events", action="store_true", ) parser.add_argument( - "--no-setup", + "--usage-reports", action="store_true", ) @@ -203,7 +203,6 @@ def _handle_unchanged(self, *, start: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - #_es8_count = _es8_cls.search().count() self._write_tabbed('es6', _es6_cls, _es6_count) self._write_tabbed('es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count)) if start: # schedule task diff --git a/poetry.lock b/poetry.lock index 90665bce81f..d86523f94de 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6" -resolved_reference = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6" +reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" +resolved_reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "ef1d6d327f5557e43482793b276ccb6c5fd07989f27367af3a3736a8547b4d1a" +content-hash = "320d3eb4cd7f0f4c5d8cc698db51ee1bf4c37f8b8d41d21a86ca5cdb9b2e6b42" diff --git a/pyproject.toml b/pyproject.toml index 013df3f448d..a5c39d297d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "8025d58e23b4e0c562e1d59c98b10ec936eb56e6"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "e18f029c406d743d407f18fda8a133b261f9c4d2"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 97cd5b7f125c2413393d1276dec710f54b24a33a Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 16 Apr 2026 10:46:00 -0400 Subject: [PATCH 048/100] wip --- ...ics_6to8.py => migrate_osfmetrics_6to8.py} | 261 +++++++++++++----- osf/metrics/es8_metrics.py | 2 +- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 198 insertions(+), 73 deletions(-) rename osf/management/commands/{migrate_metrics_6to8.py => migrate_osfmetrics_6to8.py} (50%) diff --git a/osf/management/commands/migrate_metrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py similarity index 50% rename from osf/management/commands/migrate_metrics_6to8.py rename to osf/management/commands/migrate_osfmetrics_6to8.py index 104caccfb6c..2f4cbb28385 100644 --- a/osf/management/commands/migrate_metrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -1,5 +1,6 @@ import datetime import logging +from pprint import pprint from django.core.management import call_command from django.core.management.base import BaseCommand @@ -7,7 +8,6 @@ from elasticsearch8 import helpers as es8_helpers from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me -from elasticsearch_metrics.util.timeparts import format_timeparts from framework.celery_tasks import app as celery_app from osf.metrics.preprint_metrics import ( @@ -17,6 +17,7 @@ from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics +from osf.metrics.utils import YearMonth _logger = logging.getLogger(__name__) @@ -43,57 +44,76 @@ RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, } -def _debug_migrate(es8_client, each_new): + +def _delete_all(recordtype): + # TODO: REMOVE THIS + recordtype.search().query({"match_all": {}}).delete() + recordtype.refresh() + + +def _delete_all_es8(): + # TODO: REMOVE THIS + for _es8_recordtype in _UNCHANGED_RECORDTYPES.values(): + _delete_all(_es8_recordtype) + _delete_all(es8_metrics.PublicItemUsageReportEs8) + _delete_all(es8_metrics.OsfCountedUsageRecord) + + +def _debug_migrate(each_new): for _each in each_new: - print(_each) + pprint(_each.to_dict()) def _do_migrate(es8_client, each_new): es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) -def _es6_scan(es6_recordtype, from_when: str, until_when: str): + +def _es6_scan_all(es6_recordtype): + return es6_helpers.scan( + es6_recordtype._get_connection(), + index=es6_recordtype._template_pattern, + ) + + +def _es6_scan_range(es6_recordtype, from_when: str, until_when: str): return es6_helpers.scan( - es6_client, + es6_recordtype._get_connection(), index=es6_recordtype._template_pattern, query={"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, ) def _es6_usage_report_counts() -> tuple[int, int]: - _search = ( - es6_reports.PublicItemUsageReport.search() - ) + _search = es6_reports.PublicItemUsageReport.search() _search.aggs.metric( - 'agg_item_count', - 'cardinality', - field='item_osfid', + "agg_item_count", + "cardinality", + field="item_osfid", precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total _item_count = ( _response.aggregations.agg_item_count.value - if 'agg_item_count' in _response.aggregations + if "agg_item_count" in _response.aggregations else 0 ) return (_total_count, _item_count) def _es8_usage_report_counts() -> tuple[int, int]: - _search = ( - es8_metrics.PublicItemUsageReportEs8.search() - ) + _search = es8_metrics.PublicItemUsageReportEs8.search() _search.aggs.metric( - 'agg_item_count', - 'cardinality', - field='item_osfid', + "agg_item_count", + "cardinality", + field="item_osfid", precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total.value _item_count = ( _response.aggregations.agg_item_count.value - if 'agg_item_count' in _response.aggregations + if "agg_item_count" in _response.aggregations else 0 ) return (_total_count, _item_count) @@ -105,36 +125,95 @@ def _cycle_coverage_daily(report_date): ... def _cycle_coverage_monthly(report_yearmonth): ... -def _unchanged_report_kwargs(es6_recordtype, hit): +def _get_es6_field_names(es6_recordtype): + """ + adapted from DocumentBase._get_field_names in elasticsearch8.dsl + """ + for _field_name in es6_recordtype._doc_type.mapping: + _field = es6_recordtype._doc_type.mapping[_field_name] + if hasattr(_field, "_doc_class"): + for _sub_field in _get_es6_field_names(_field._doc_class): + yield f"{_field_name}.{_sub_field}" + else: + yield _field_name + + +def _assert_field_unchangedness(es6_recordtype, es8_recordtype): + _es6_fields = set(_get_es6_field_names(es6_recordtype)) + _es8_fields = set(es8_recordtype._get_field_names()) + + # remove fields intentionally removed/renamed in migration if issubclass(es6_recordtype, es6_reports.DailyReport): - _cycle_coverage = format_timeparts( - datetime.date.fromisoformat(hit.pop("report_date")), djel8me.DAILY - ) + assert issubclass(es8_recordtype, djel8me.CyclicRecord) + _es6_fields.remove("timestamp") + _es6_fields.remove("report_date") elif issubclass(es6_recordtype, es6_reports.MonthlyReport): - _cycle_coverage = format_timeparts(hit.pop("report_yearmonth"), djel8me.MONTHLY) - return { - **hit, - 'cycle_coverage': _cycle_coverage, - } + assert issubclass(es8_recordtype, djel8me.CyclicRecord) + _es6_fields.remove("timestamp") + _es6_fields.remove("report_yearmonth") + else: + assert issubclass(es8_recordtype, djel8me.EventRecord) + # remove fields intentionally added in migration + _es8_fields.remove("timeseries_timeparts") + if issubclass(es8_recordtype, djel8me.CyclicRecord): + _es8_fields.remove("created") + _es8_fields.remove("cycle_coverage") -@celery_app.task + # all remaining fields should match + assert _es6_fields == _es8_fields + + +# TODO: @celery_app.task def migrate_unchanged_recordtype( es6_recordtype_name: str, ): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] + _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) + _kwarg_converter = ( + _each_cyclicrecord_kwarg + if issubclass( + _es6_recordtype, (es6_reports.DailyReport, es6_reports.MonthlyReport) + ) + else _each_eventrecord_kwarg + ) def _each_new(): - for _hit in _es6_scan(_es6_recordtype, from_when, until_when): - breakpoint() + for _hit in _es6_scan_all(_es6_recordtype): + _es8_kwargs = dict(_kwarg_converter(_hit["_source"])) yield _es8_recordtype.record( - ..., - using=False, # saved in bulk + **_es8_kwargs, + using=False, # skip saving; save in bulk ) _debug_migrate(_each_new()) - # _do_migrate(_each_new()) + # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) + + +def _semverish_from_yearmonth(given_yearmonth: str): + _ym = YearMonth.from_str(given_yearmonth) + return f"{_ym.year}.{_ym.month}" + + +def _semverish_from_date(given_date: str): + _d = datetime.date.fromisoformat(given_date) + return f"{_d.year}.{_d.month}.{_d.day}" + + +def _each_cyclicrecord_kwarg(es6_source: dict): + for _key, _val in es6_source.items(): + if _key == "report_yearmonth": + yield ("cycle_coverage", _semverish_from_yearmonth(_val)) + elif _key == "report_date": + yield ("cycle_coverage", _semverish_from_date(_val)) + elif _key != "timestamp": + # skipping timestamp; on daily/monthly reports just copied from yearmonth/date + yield (_key, _val) + + +def _each_eventrecord_kwarg(es6_source) -> dict: + yield from es6_source.items() # no changes needed @celery_app.task @@ -155,6 +234,7 @@ def migrate_usage_reports(from_date, until_date): # add cumulative count ... + class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( @@ -182,13 +262,23 @@ def add_arguments(self, parser): action="store_true", ) - def handle(self, *, start, unchanged, usage_events, usage_reports, clear_state, no_setup, **kwargs): + def handle( + self, + *, + start, + unchanged, + usage_events, + usage_reports, + clear_state, + no_setup, + **kwargs, + ): self._quiet_chatty_loggers() if not no_setup: - call_command('djelme_backend_setup') + call_command("djelme_backend_setup") if clear_state: self._clear_state() - self._display_started_at(start=start) + self._migration_start(start=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: self._handle_unchanged(start=start) @@ -203,12 +293,18 @@ def _handle_unchanged(self, *, start: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - self._write_tabbed('es6', _es6_cls, _es6_count) - self._write_tabbed('es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count)) + self._write_tabbed("es6", _es6_cls, _es6_count) + self._write_tabbed( + "es8", + _es8_cls, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) if start: # schedule task - self._write_tabbed('starting', _es6_cls, '=>', _es8_cls) + self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) + migrate_unchanged_recordtype(_es6_cls.__name__) # TODO: migrate_unchanged_recordtype.apply_async(...) - self.stdout.write('---') + self.stdout.write("---") def _handle_usage_events(self, *, start: bool): # for counted-usage events: @@ -219,67 +315,96 @@ def _handle_usage_events(self, *, start: bool): _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed('es6', PreprintViewEs6, _es6_pview_count) - self._write_tabbed('es6', PreprintDownloadEs6, _es6_pdownload_count) - self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) - self._write_tabbed('es6', '(total to migrate)', _es6_count) - self._write_tabbed('es8', es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count)) + self._write_tabbed("es6", PreprintViewEs6, _es6_pview_count) + self._write_tabbed("es6", PreprintDownloadEs6, _es6_pdownload_count) + self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed("es6", "(total to migrate)", _es6_count) + self._write_tabbed( + "es8", + es8_metrics.OsfCountedUsageRecord, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f'starting usages => {es8_metrics.OsfCountedUsageRecord}') - # TODO: migrate_usage_events.apply_async(...) - self.stdout.write('---') + self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") + for _from_date, _until_date in _each_date_in_range(...): + migrate_usage_events(_from_date.isoformat(), _until_date.isoformat()) + # TODO: migrate_usage_events.apply_async(...) + self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): # display counts of reports and distinct items _es6_count, _es6_item_count = _es6_usage_report_counts() _es8_count, _es8_item_count = _es8_usage_report_counts() - self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count) - self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, _es8_count, style=self._eq_style(_es8_count, _es6_count)) - self._write_tabbed('es6', es6_reports.PublicItemUsageReport, '(items)', _es6_item_count) - self._write_tabbed('es8', es8_metrics.PublicItemUsageReportEs8, '(items)', _es8_item_count, - style=self._eq_style(_es8_item_count, _es6_item_count)) + self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + self._write_tabbed( + "es6", es6_reports.PublicItemUsageReport, "(items)", _es6_item_count + ) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + "(items)", + _es8_item_count, + style=self._eq_style(_es8_item_count, _es6_item_count), + ) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts if start: # schedule per-item tasks - self.stdout.write(f'starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}') - # TODO: migrate_usage_events.apply_async(...) - self.stdout.write('---') + self.stdout.write( + f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" + ) + # TODO: migrate_usage_reports.apply_async(...) + self.stdout.write("---") - def _display_started_at(self, start): + def _migration_start(self, start): _started_at = es8_metrics.Elastic6To8State.get_started_at() if _started_at: self.stdout.write( - f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}' + f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" ) elif start: _started_at = es8_metrics.Elastic6To8State.set_started_at_now() self.stdout.write( - f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}' + f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" ) else: self.stdout.write( - 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)' + "osf.metrics 6->8 migration not started nor starting (run with `--start` to start)" ) - self.stdout.write('---') + self.stdout.write("---") def _clear_state(self): - es8_metrics.Elastic6To8State.search().delete() + self.stdout.write( + "clearing all migration state (start time, etc)", self.style.NOTICE + ) + es8_metrics.Elastic6To8State.search().query({"match_all": {}}).delete() + es8_metrics.Elastic6To8State.refresh() + # TODO: REMOVE THIS + self.stdout.write("deleting all migration target data in es8", self.style.ERROR) + _delete_all_es8() def _eq_style(self, num: int, should_be: int): - return self.style.SUCCESS if (num == should_be) else self.style.NOTICE + return self.style.SUCCESS if (num == should_be) else self.style.WARNING def _write_tabbed(self, *strables, style=None): def _to_str(strable): if isinstance(strable, type): return strable.__name__ return str(strable) - self.stdout.write('\t'.join(map(_to_str, strables)), style) + + self.stdout.write("\t".join(map(_to_str, strables)), style) def _quiet_chatty_loggers(self): _chatty_loggers = [ - 'elasticsearch', - 'elastic_transport', - 'elasticsearch_metrics', + "elasticsearch", + "elastic_transport", + "elasticsearch_metrics", ] for logger_name in _chatty_loggers: logging.getLogger(logger_name).setLevel(logging.ERROR) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 8b5e9dd5bc8..4980358dc5f 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -351,7 +351,7 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): ### # data migration state -class Elastic6To8State(djelme.DjelmeRecordtype): +class Elastic6To8State(djelme.SimpleRecord): """index for storing values helpful for keeping track of the elastic 6->8 data migration""" UNIQUE_TOGETHER_FIELDS = ('key',) key: str diff --git a/poetry.lock b/poetry.lock index d86523f94de..df08934ef29 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" -resolved_reference = "e18f029c406d743d407f18fda8a133b261f9c4d2" +reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" +resolved_reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "320d3eb4cd7f0f4c5d8cc698db51ee1bf4c37f8b8d41d21a86ca5cdb9b2e6b42" +content-hash = "d149bb933fd3845714e26920360c34f3224ab0f84a789b3185cf716033a8d4bf" diff --git a/pyproject.toml b/pyproject.toml index a5c39d297d1..4b6f896f39e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "e18f029c406d743d407f18fda8a133b261f9c4d2"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "d7e0483972a58b940bec843679c2a8c9b8bcb75c"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 7eba5cce220851ac479b74b67e1baa71e94d2c95 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 16 Apr 2026 16:25:28 -0400 Subject: [PATCH 049/100] wip --- .../commands/migrate_osfmetrics_6to8.py | 222 +++++++++++++----- osf/metrics/es8_metrics.py | 1 + 2 files changed, 167 insertions(+), 56 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 2f4cbb28385..812322657a9 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -1,4 +1,6 @@ +import collections import datetime +import functools import logging from pprint import pprint @@ -10,6 +12,7 @@ from elasticsearch_metrics.imps import elastic8 as djel8me from framework.celery_tasks import app as celery_app +from osf.metadata import rdfutils from osf.metrics.preprint_metrics import ( PreprintView as PreprintViewEs6, PreprintDownload as PreprintDownloadEs6, @@ -18,10 +21,14 @@ from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics from osf.metrics.utils import YearMonth +from website import settings as website_settings _logger = logging.getLogger(__name__) +### +# constants + _USAGE_MONTHS_BACK = 3 _MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control @@ -45,6 +52,68 @@ } +### +# celery tasks + + +# TODO: @celery_app.task +def migrate_unchanged_recordtype(es6_recordtype_name: str): + _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) + _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] + _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) + + if issubclass(_es8_recordtype, djel8me.CyclicRecord): + + def _new_es8_record(source_dict): + _kwargs = dict(_convert_cyclicrecord_kwargs(source_dict)) + return _es8_recordtype(**_kwargs) + + else: # no conversion needed for event record with unchanged fields + + def _new_es8_record(source_dict): + return _es8_recordtype(**source_dict) + + def _each_new(): + for _hit in _es6_scan_all(_es6_recordtype): + yield _new_es8_record(_hit["_source"]) + + _debug_migrate(_each_new()) + # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) + + +# TODO: @celery_app.task +def migrate_counted_usages(from_when: str, until_when: str): + # CountedAuthUsage => OsfCountedUsageRecord + def _each_new(): + for _hit in _es6_scan_all(CountedUsageEs6, from_when, until_when): + yield _convert_counted_usage(_hit["_source"]) + + _debug_migrate(_each_new()) + + +# TODO: @celery_app.task +def migrate_preprint_views(from_date: str, until_date: str): + # convert to counted-usage + ... + + +# TODO: @celery_app.task +def migrate_preprint_downloads(from_date: str, until_date: str): + # convert to counted-usage + ... + + +# TODO: @celery_app.task +def migrate_usage_reports(from_date, until_date): + # from PublicItemUsageReport to PublicItemUsageReportEs8 + # add cumulative count + ... + + +### +# various helper functions + + def _delete_all(recordtype): # TODO: REMOVE THIS recordtype.search().query({"match_all": {}}).delete() @@ -60,6 +129,7 @@ def _delete_all_es8(): def _debug_migrate(each_new): + # TODO: remove this for _each in each_new: pprint(_each.to_dict()) @@ -68,6 +138,18 @@ def _do_migrate(es8_client, each_new): es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) +def _date_range( + range_start: datetime.date, + range_end: datetime.date, + step: datetime.timedelta = datetime.timedelta(days=1), +) -> collections.abc.Iterator[tuple[datetime.date, datetime.date]]: + _from_date = range_start + _until_date = range_start + step + while _from_date < range_end: + yield (_from_date, _until_date) + (_from_date, _until_date) = (_until_date, _until_date + step) + + def _es6_scan_all(es6_recordtype): return es6_helpers.scan( es6_recordtype._get_connection(), @@ -119,12 +201,6 @@ def _es8_usage_report_counts() -> tuple[int, int]: return (_total_count, _item_count) -def _cycle_coverage_daily(report_date): ... - - -def _cycle_coverage_monthly(report_yearmonth): ... - - def _get_es6_field_names(es6_recordtype): """ adapted from DocumentBase._get_field_names in elasticsearch8.dsl @@ -164,33 +240,6 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): assert _es6_fields == _es8_fields -# TODO: @celery_app.task -def migrate_unchanged_recordtype( - es6_recordtype_name: str, -): - _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) - _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] - _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) - _kwarg_converter = ( - _each_cyclicrecord_kwarg - if issubclass( - _es6_recordtype, (es6_reports.DailyReport, es6_reports.MonthlyReport) - ) - else _each_eventrecord_kwarg - ) - - def _each_new(): - for _hit in _es6_scan_all(_es6_recordtype): - _es8_kwargs = dict(_kwarg_converter(_hit["_source"])) - yield _es8_recordtype.record( - **_es8_kwargs, - using=False, # skip saving; save in bulk - ) - - _debug_migrate(_each_new()) - # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) - - def _semverish_from_yearmonth(given_yearmonth: str): _ym = YearMonth.from_str(given_yearmonth) return f"{_ym.year}.{_ym.month}" @@ -201,38 +250,85 @@ def _semverish_from_date(given_date: str): return f"{_d.year}.{_d.month}.{_d.day}" -def _each_cyclicrecord_kwarg(es6_source: dict): +def _convert_cyclicrecord_kwargs(es6_source: dict): for _key, _val in es6_source.items(): if _key == "report_yearmonth": + # report_yearmonth converts to cycle_coverage Y.M yield ("cycle_coverage", _semverish_from_yearmonth(_val)) elif _key == "report_date": + # report_date converts to cycle_coverage Y.M.D yield ("cycle_coverage", _semverish_from_date(_val)) elif _key != "timestamp": # skipping timestamp; on daily/monthly reports just copied from yearmonth/date yield (_key, _val) -def _each_eventrecord_kwarg(es6_source) -> dict: - yield from es6_source.items() # no changes needed +def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: + _item_iri = _iri_from_osfid(source_dict["item_guid"]) + return es8_metrics.OsfCountedUsageRecord( + # fields from djelme.CountedUsageRecord + timestamp=source_dict["timestamp"], + sessionhour_id=source_dict["session_id"], + platform_iri=source_dict["platform_iri"], + # TODO: database_iri=provider iri + item_iri=_item_iri, + within_iris=[ + _item_iri, # correct mistake; make inclusive-within aggregations easier + *( + _iri_from_osfid(_within_osfid) + for _within_osfid in source_dict["surrounding_guids"] + ), + ], + # fields from OsfCountedUsageRecord + item_osfid=source_dict["item_guid"], + item_type=_convert_item_type(source_dict), + item_public=source_dict["item_public"], + provider_id=source_dict["provider_id"], + user_is_authenticated=source_dict["user_is_authenticated"], + action_labels=source_dict["action_labels"], + pageview_info=source_dict[ + "pageview_info" + ], # TODO: does this need the PageviewInfo object? + ) -@celery_app.task -def migrate_preprint_views(from_date, until_date): - # convert to counted-usage - ... +def _iri_from_osfid(osfid: str) -> str: + return f"{website_settings.DOMAIN}{osfid}" -@celery_app.task -def migrate_preprint_downloads(from_date, until_date): - # convert to counted-usage - ... +def _convert_item_type(es6_usage_dict): + """convert model-name item types to OSFMAP item types + previous item_types use `type(osf_model).__name__.lower()` + """ + _modelname = es6_usage_dict["item_type"] + assert isinstance(_modelname, str) + match _modelname: + case "osfuser": + return rdfutils.DCTERMS.Agent + case "preprint": + return rdfutils.OSF.Preprint + case "registration": + return ( + rdfutils.OSF.RegistrationComponent + if es6_usage_dict.get("surrounding_guids") + else rdfutils.OSF.Registration + ) + case "node": + return ( + rdfutils.OSF.ProjectComponent + if es6_usage_dict.get("surrounding_guids") + else rdfutils.OSF.Project + ) + case _ if "file" in _modelname: + return rdfutils.OSF.File + case _: + _logger.error(f"unknown item type: {_modelname}") + return _modelname # give up -@celery_app.task -def migrate_usage_reports(from_date, until_date): - # from PublicItemUsageReport to PublicItemUsageReportEs8 - # add cumulative count - ... + +### +# the command itself class Command(BaseCommand): @@ -278,7 +374,7 @@ def handle( call_command("djelme_backend_setup") if clear_state: self._clear_state() - self._migration_start(start=start) + self._check_started_at(start_now=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: self._handle_unchanged(start=start) @@ -327,9 +423,18 @@ def _handle_usage_events(self, *, start: bool): ) if start: # schedule (per-day?) tasks (if --start) self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") - for _from_date, _until_date in _each_date_in_range(...): - migrate_usage_events(_from_date.isoformat(), _until_date.isoformat()) - # TODO: migrate_usage_events.apply_async(...) + _started = self._migration_started_at + _range_start = ( + _started - datetime.timedelta(months=_USAGE_MONTHS_BACK) + ).date + _range_end = _started.date() + datetime.timedelta(days=1) + for _from_date, _until_date in _date_range(_range_start, _range_end): + _from_str = _from_date.isoformat() + _until_str = _until_date.isoformat() + # TODO: .apply_async(...) + migrate_counted_usages(_from_str, _until_str) + migrate_preprint_views(_from_str, _until_str) + migrate_preprint_downloads(_from_str, _until_str) self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): @@ -362,13 +467,18 @@ def _handle_usage_reports(self, *, start: bool): # TODO: migrate_usage_reports.apply_async(...) self.stdout.write("---") - def _migration_start(self, start): - _started_at = es8_metrics.Elastic6To8State.get_started_at() + @functools.cached_property + def _migration_started_at(self): + return es8_metrics.Elastic6To8State.get_started_at() + + def _check_started_at(self, start_now): + _started_at = self._migration_started_at if _started_at: self.stdout.write( f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" ) - elif start: + elif start_now: + del self._migration_started_at # clear cache _started_at = es8_metrics.Elastic6To8State.set_started_at_now() self.stdout.write( f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 4980358dc5f..3be81e9262e 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -89,6 +89,7 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): item_osfid: str item_type: str item_public: bool + provider_id: str user_is_authenticated: bool action_labels: list[str] pageview_info: PageviewInfo From 7d554b66ba27447f79179c8e997427215302a11b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Apr 2026 14:14:20 -0400 Subject: [PATCH 050/100] wip --- .../commands/migrate_osfmetrics_6to8.py | 189 +++++++++++------- osf/metrics/es8_metrics.py | 9 +- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 130 insertions(+), 76 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 812322657a9..f0e1147f025 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -7,15 +7,16 @@ from django.core.management import call_command from django.core.management.base import BaseCommand from elasticsearch6 import helpers as es6_helpers -from elasticsearch8 import helpers as es8_helpers +from elasticsearch6_dsl.connections import connections as es6_connections +from elasticsearch8.dsl.connections import connections as es8_connections from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me from framework.celery_tasks import app as celery_app from osf.metadata import rdfutils from osf.metrics.preprint_metrics import ( - PreprintView as PreprintViewEs6, - PreprintDownload as PreprintDownloadEs6, + PreprintView, + PreprintDownload, ) from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports @@ -29,7 +30,7 @@ ### # constants -_USAGE_MONTHS_BACK = 3 +_USAGE_DAYS_BACK = 99 _MAX_CARDINALITY_PRECISION = 40000 # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics-cardinality-aggregation.html#_precision_control @@ -61,58 +62,76 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) - - if issubclass(_es8_recordtype, djel8me.CyclicRecord): - - def _new_es8_record(source_dict): - _kwargs = dict(_convert_cyclicrecord_kwargs(source_dict)) - return _es8_recordtype(**_kwargs) - - else: # no conversion needed for event record with unchanged fields - - def _new_es8_record(source_dict): - return _es8_recordtype(**source_dict) - - def _each_new(): - for _hit in _es6_scan_all(_es6_recordtype): - yield _new_es8_record(_hit["_source"]) - - _debug_migrate(_each_new()) - # TODO: _do_migrate(_es8_recordtype._get_connection(), _each_new()) + _convert_kwargs = ( + _convert_unchanged_cyclicrecord_kwargs + if issubclass(_es8_recordtype, djel8me.CyclicRecord) + else (lambda _kw: _kw) # no conversion needed for event record + ) + _each_new = ( + _es8_recordtype(**_convert_kwargs(_hit["_source"])) + for _hit in _es6_scan_all(_es6_recordtype) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(_es8_recordtype, _each_new) # TODO: @celery_app.task def migrate_counted_usages(from_when: str, until_when: str): # CountedAuthUsage => OsfCountedUsageRecord - def _each_new(): - for _hit in _es6_scan_all(CountedUsageEs6, from_when, until_when): - yield _convert_counted_usage(_hit["_source"]) - - _debug_migrate(_each_new()) + _each_new = ( + _convert_counted_usage(_hit["_source"]) + for _hit in _es6_scan_range(CountedUsageEs6, from_when, until_when) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) # TODO: @celery_app.task -def migrate_preprint_views(from_date: str, until_date: str): - # convert to counted-usage - ... +def migrate_preprint_views(from_when: str, until_when: str): + # PreprintView => OsfCountedUsageRecord + _action_labels = ['view', 'web'] + _each_new = ( + _convert_preprint_metric(_hit["_source"], _action_labels) + for _hit in _es6_scan_range(PreprintView, from_when, until_when) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) # TODO: @celery_app.task -def migrate_preprint_downloads(from_date: str, until_date: str): - # convert to counted-usage - ... +def migrate_preprint_downloads(from_when: str, until_when: str): + # PreprintDownload => OsfCountedUsageRecord + _action_labels = ['download'] + _each_new = ( + _convert_preprint_metric(_hit["_source"], _action_labels) + for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) + ) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) # TODO: @celery_app.task -def migrate_usage_reports(from_date, until_date): +def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 # add cumulative count - ... + def _each_new(): + for _hit in _es6_scan_all(CountedUsageEs6, query=...): + yield ...(_hit["_source"]) + + _debug_migrate(_each_new) + # TODO: return _es8_bulk_save(PublicItemUsageReportEs8, _each_new) ### # various helper functions +def _es6_connection(): + return es6_connections.get_connection('osfmetrics_es6') + + +def _es8_connection(): + return es8_connections.get_connection('osfmetrics_es8') + def _delete_all(recordtype): # TODO: REMOVE THIS @@ -131,11 +150,15 @@ def _delete_all_es8(): def _debug_migrate(each_new): # TODO: remove this for _each in each_new: - pprint(_each.to_dict()) + pprint(_each.to_dict(include_meta=True)) -def _do_migrate(es8_client, each_new): - es8_helpers.bulk(es8_client, each_new, ..., stats_only=True) +def _es8_bulk_save(es8_recordtype, each_new_record): + _success_count, _fail_count = es8_recordtype.bulk( + each_new_record, + stats_only=True, + ) + return _success_count def _date_range( @@ -150,18 +173,19 @@ def _date_range( (_from_date, _until_date) = (_until_date, _until_date + step) -def _es6_scan_all(es6_recordtype): +def _es6_scan_all(es6_recordtype, query=None): return es6_helpers.scan( - es6_recordtype._get_connection(), + _es6_connection(), index=es6_recordtype._template_pattern, + query=query, ) def _es6_scan_range(es6_recordtype, from_when: str, until_when: str): return es6_helpers.scan( - es6_recordtype._get_connection(), + _es6_connection(), index=es6_recordtype._template_pattern, - query={"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + query={"query": {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}}, ) @@ -218,7 +242,7 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): _es6_fields = set(_get_es6_field_names(es6_recordtype)) _es8_fields = set(es8_recordtype._get_field_names()) - # remove fields intentionally removed/renamed in migration + # remove fields intentionally removed in migration if issubclass(es6_recordtype, es6_reports.DailyReport): assert issubclass(es8_recordtype, djel8me.CyclicRecord) _es6_fields.remove("timestamp") @@ -250,17 +274,19 @@ def _semverish_from_date(given_date: str): return f"{_d.year}.{_d.month}.{_d.day}" -def _convert_cyclicrecord_kwargs(es6_source: dict): - for _key, _val in es6_source.items(): - if _key == "report_yearmonth": - # report_yearmonth converts to cycle_coverage Y.M - yield ("cycle_coverage", _semverish_from_yearmonth(_val)) - elif _key == "report_date": - # report_date converts to cycle_coverage Y.M.D - yield ("cycle_coverage", _semverish_from_date(_val)) - elif _key != "timestamp": - # skipping timestamp; on daily/monthly reports just copied from yearmonth/date - yield (_key, _val) +def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict: + def _each_kwarg(): + for _key, _val in es6_source.items(): + if _key == "report_yearmonth": + # report_yearmonth converts to cycle_coverage Y.M + yield ("cycle_coverage", _semverish_from_yearmonth(_val)) + elif _key == "report_date": + # report_date converts to cycle_coverage Y.M.D + yield ("cycle_coverage", _semverish_from_date(_val)) + elif _key != "timestamp": + # skipping timestamp; on daily/monthly reports just copied from yearmonth/date + yield (_key, _val) + return dict(_each_kwarg()) def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: @@ -276,19 +302,40 @@ def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: _item_iri, # correct mistake; make inclusive-within aggregations easier *( _iri_from_osfid(_within_osfid) - for _within_osfid in source_dict["surrounding_guids"] + for _within_osfid in source_dict.get("surrounding_guids", ()) ), ], # fields from OsfCountedUsageRecord item_osfid=source_dict["item_guid"], item_type=_convert_item_type(source_dict), item_public=source_dict["item_public"], - provider_id=source_dict["provider_id"], + provider_id=source_dict.get("provider_id"), user_is_authenticated=source_dict["user_is_authenticated"], action_labels=source_dict["action_labels"], - pageview_info=source_dict[ - "pageview_info" - ], # TODO: does this need the PageviewInfo object? + # TODO: does this need the PageviewInfo object? + pageview_info=source_dict.get("pageview_info"), + ) + + +def _convert_preprint_metric(source_dict, action_labels: list[str]) -> es8_metrics.OsfCountedUsageRecord: + _preprint_iri = _iri_from_osfid(source_dict["preprint_id"]) + return es8_metrics.OsfCountedUsageRecord.record( + using=False, # don't save yet; will save in bulk + # fields used to compute a sessionhour_id: + timestamp=source_dict["timestamp"], + user_id=source_dict['user_id'], # TODO: handle None? + # fields from djelme.CountedUsageRecord: + platform_iri=website_settings.DOMAIN, + # TODO: database_iri=provider iri + item_iri=_preprint_iri, + within_iris=[_preprint_iri], + # fields from OsfCountedUsageRecord: + item_osfid=source_dict["preprint_id"], + item_type=rdfutils.OSF.Preprint, + item_public=True, + provider_id=source_dict["provider_id"], + user_is_authenticated=bool(source_dict["user_id"]), + action_labels=action_labels, ) @@ -301,7 +348,11 @@ def _convert_item_type(es6_usage_dict): previous item_types use `type(osf_model).__name__.lower()` """ - _modelname = es6_usage_dict["item_type"] + try: + _modelname = es6_usage_dict["item_type"] + except KeyError: + # this probably only happens in fake data + return None assert isinstance(_modelname, str) match _modelname: case "osfuser": @@ -320,11 +371,11 @@ def _convert_item_type(es6_usage_dict): if es6_usage_dict.get("surrounding_guids") else rdfutils.OSF.Project ) - case _ if "file" in _modelname: + case _ if "file" in _modelname: # hack for the many "filenode" models return rdfutils.OSF.File - case _: + case _: # give up gracefully _logger.error(f"unknown item type: {_modelname}") - return _modelname # give up + return _modelname ### @@ -406,13 +457,13 @@ def _handle_usage_events(self, *, start: bool): # for counted-usage events: # TODO: last X months only # display counts for each view/download event type - _es6_pview_count = PreprintViewEs6.search().count() - _es6_pdownload_count = PreprintDownloadEs6.search().count() + _es6_pview_count = PreprintView.search().count() + _es6_pdownload_count = PreprintDownload.search().count() _es6_usage_event_count = CountedUsageEs6.search().count() _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed("es6", PreprintViewEs6, _es6_pview_count) - self._write_tabbed("es6", PreprintDownloadEs6, _es6_pdownload_count) + self._write_tabbed("es6", PreprintView, _es6_pview_count) + self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) self._write_tabbed("es6", "(total to migrate)", _es6_count) self._write_tabbed( @@ -425,8 +476,8 @@ def _handle_usage_events(self, *, start: bool): self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") _started = self._migration_started_at _range_start = ( - _started - datetime.timedelta(months=_USAGE_MONTHS_BACK) - ).date + _started - datetime.timedelta(days=_USAGE_DAYS_BACK) + ).date() _range_end = _started.date() + datetime.timedelta(days=1) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 3be81e9262e..1824fcf2b3f 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -92,9 +92,10 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): provider_id: str user_is_authenticated: bool action_labels: list[str] - pageview_info: PageviewInfo + pageview_info: PageviewInfo | None - def save(self, *args, **kwargs): + def clean(self): + super().clean() # autofill pageview_info fields if self.pageview_info: self.pageview_info.hour_of_day = self.timestamp.hour @@ -104,7 +105,9 @@ def save(self, *args, **kwargs): _ref_url = self.pageview_info.referer_url if _ref_url: self.pageview_info.referer_domain = urlsplit(_ref_url).netloc - super().save(*args, **kwargs) + # ensure inclusive "within" + if self.item_iri not in self.within_iris: + self.within_iris = [self.item_iri, *self.within_iris] class ActionLabel(enum.Enum): diff --git a/poetry.lock b/poetry.lock index df08934ef29..09ee8c9749b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" -resolved_reference = "d7e0483972a58b940bec843679c2a8c9b8bcb75c" +reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" +resolved_reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "d149bb933fd3845714e26920360c34f3224ab0f84a789b3185cf716033a8d4bf" +content-hash = "9aea963ca1a8b23c8e07fa22b34dc23c0f53d1d017edf29aad65a733ab4832fe" diff --git a/pyproject.toml b/pyproject.toml index 4b6f896f39e..a0a08b48047 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "d7e0483972a58b940bec843679c2a8c9b8bcb75c"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "445fcea0aa6b5d07523cd67e959cb14088f15bb0"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 68b38bae8483eb349f785105dd887617e1b046d6 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 17 Apr 2026 17:28:12 -0400 Subject: [PATCH 051/100] wip --- .../commands/migrate_osfmetrics_6to8.py | 338 ++++++++++++++---- osf/metrics/es8_metrics.py | 55 ++- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 320 insertions(+), 81 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index f0e1147f025..acbc43df5dd 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -6,14 +6,18 @@ from django.core.management import call_command from django.core.management.base import BaseCommand +from django.db import OperationalError as DjangoOperationalError +from elasticsearch6.exceptions import ConnectionError as Elastic6ConnectionError from elasticsearch6 import helpers as es6_helpers from elasticsearch6_dsl.connections import connections as es6_connections +from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError from elasticsearch8.dsl.connections import connections as es8_connections from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me +from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app -from osf.metadata import rdfutils +from osf.metadata.rdfutils import OSF, DCTERMS from osf.metrics.preprint_metrics import ( PreprintView, PreprintDownload, @@ -21,7 +25,9 @@ from osf.metrics.counted_usage import CountedAuthUsage as CountedUsageEs6 from osf.metrics import reports as es6_reports from osf.metrics import es8_metrics, RegistriesModerationMetrics +from osf.metrics.reporters.public_item_usage import _iter_composite_bucket_keys from osf.metrics.utils import YearMonth +from osf import models as osfdb from website import settings as website_settings @@ -52,12 +58,22 @@ RegistriesModerationMetrics: es8_metrics.RegistriesModerationMetricsEs8, } +_TASK_KWARGS = dict( + autoretry_for=( + DjangoOperationalError, + Elastic6ConnectionError, + Elastic8ConnectionError, + PostgresOperationalError, + ), + max_retries=50, + retry_backoff=True, +) ### # celery tasks -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_unchanged_recordtype(es6_recordtype_name: str): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] @@ -71,66 +87,75 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es8_recordtype(**_convert_kwargs(_hit["_source"])) for _hit in _es6_scan_all(_es6_recordtype) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(_es8_recordtype, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(_es8_recordtype, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_counted_usages(from_when: str, until_when: str): # CountedAuthUsage => OsfCountedUsageRecord _each_new = ( _convert_counted_usage(_hit["_source"]) - for _hit in _es6_scan_range(CountedUsageEs6, from_when, until_when) + for _hit in _es6_scan_range( + CountedUsageEs6, + from_when, + until_when, + addl_filter={"exists": {"field": "item_guid"}}, + ) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_preprint_views(from_when: str, until_when: str): # PreprintView => OsfCountedUsageRecord - _action_labels = ['view', 'web'] + _action_labels = ["view", "web"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintView, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_preprint_downloads(from_when: str, until_when: str): # PreprintDownload => OsfCountedUsageRecord - _action_labels = ['download'] + _action_labels = ["download"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) -# TODO: @celery_app.task +@celery_app.task(**_TASK_KWARGS) def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 # add cumulative count def _each_new(): - for _hit in _es6_scan_all(CountedUsageEs6, query=...): - yield ...(_hit["_source"]) + for _hit in _es6_scan_all( + es6_reports.PublicItemUsageReport, + query_body={"query": {"term": {"item_osfid": osfid}}}, + ): + yield _convert_public_usage_report(_hit["_source"]) - _debug_migrate(_each_new) - # TODO: return _es8_bulk_save(PublicItemUsageReportEs8, _each_new) + # _debug_migrate(_each_new) + return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) ### # various helper functions + def _es6_connection(): - return es6_connections.get_connection('osfmetrics_es6') + return es6_connections.get_connection("osfmetrics_es6") def _es8_connection(): - return es8_connections.get_connection('osfmetrics_es8') + return es8_connections.get_connection("osfmetrics_es8") def _delete_all(recordtype): @@ -173,19 +198,24 @@ def _date_range( (_from_date, _until_date) = (_until_date, _until_date + step) -def _es6_scan_all(es6_recordtype, query=None): +def _es6_scan_all(es6_recordtype, query_body=None): return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, - query=query, + query=query_body, ) -def _es6_scan_range(es6_recordtype, from_when: str, until_when: str): +def _es6_scan_range(es6_recordtype, from_when: str, until_when: str, addl_filter=None): + _filters = [ + {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + ] + if addl_filter: + _filters.append(addl_filter) return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, - query={"query": {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}}, + query={"query": {"bool": {"filter": _filters}}}, ) @@ -286,59 +316,182 @@ def _each_kwarg(): elif _key != "timestamp": # skipping timestamp; on daily/monthly reports just copied from yearmonth/date yield (_key, _val) + return dict(_each_kwarg()) def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: _item_iri = _iri_from_osfid(source_dict["item_guid"]) + _item_type = _convert_item_type(source_dict) return es8_metrics.OsfCountedUsageRecord( - # fields from djelme.CountedUsageRecord + # fields from djelme.CountedUsageRecord: timestamp=source_dict["timestamp"], sessionhour_id=source_dict["session_id"], platform_iri=source_dict["platform_iri"], - # TODO: database_iri=provider iri + database_iri=_convert_database_iri(source_dict.get("provider_id"), _item_type), item_iri=_item_iri, within_iris=[ - _item_iri, # correct mistake; make inclusive-within aggregations easier - *( - _iri_from_osfid(_within_osfid) - for _within_osfid in source_dict.get("surrounding_guids", ()) - ), + _iri_from_osfid(_within_osfid) + for _within_osfid in source_dict.get("surrounding_guids", ()) ], - # fields from OsfCountedUsageRecord + # fields from OsfCountedUsageRecord: item_osfid=source_dict["item_guid"], - item_type=_convert_item_type(source_dict), + item_type=_item_type, item_public=source_dict["item_public"], provider_id=source_dict.get("provider_id"), user_is_authenticated=source_dict["user_is_authenticated"], action_labels=source_dict["action_labels"], - # TODO: does this need the PageviewInfo object? + # TODO: does this need the PageviewInfo object or is the dictionary fine? pageview_info=source_dict.get("pageview_info"), ) -def _convert_preprint_metric(source_dict, action_labels: list[str]) -> es8_metrics.OsfCountedUsageRecord: +def _convert_preprint_metric( + source_dict, action_labels: list[str] +) -> es8_metrics.OsfCountedUsageRecord: _preprint_iri = _iri_from_osfid(source_dict["preprint_id"]) return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: timestamp=source_dict["timestamp"], - user_id=source_dict['user_id'], # TODO: handle None? + user_id=source_dict.get("user_id"), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - # TODO: database_iri=provider iri + database_iri=_convert_database_iri( + source_dict.get("provider_id"), OSF.Preprint + ), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: item_osfid=source_dict["preprint_id"], - item_type=rdfutils.OSF.Preprint, + item_type=OSF.Preprint, item_public=True, - provider_id=source_dict["provider_id"], - user_is_authenticated=bool(source_dict["user_id"]), + provider_id=source_dict.get("provider_id"), + user_is_authenticated=bool(source_dict.get("user_id")), action_labels=action_labels, ) +def _convert_public_usage_report(source_dict) -> es8_metrics.PublicItemUsageReportEs8: + _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( + osfid=source_dict["item_osfid"], + until_when=YearMonth.from_str(source_dict["report_yearmonth"]).month_end(), + item_type=source_dict.get("item_type"), + ) + return es8_metrics.PublicItemUsageReportEs8( + item_osfid=source_dict["item_osfid"], + item_type=source_dict.get("item_type"), + provider_id=source_dict.get("provider_id"), + platform_iri=source_dict.get("platform_iri"), + view_count=source_dict.get("view_count"), + view_session_count=source_dict.get("view_session_count"), + cumulative_view_count=_c_views, + cumulative_view_session_count=_c_view_sess, + download_count=source_dict.get("download_count"), + download_session_count=source_dict.get("download_session_count"), + cumulative_download_count=_c_downloads, + cumulative_download_session_count=_c_download_sess, + ) + + +def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): + if item_type == "preprint": + _views = _cumulative_preprint_count(PreprintView, osfid, until_when) + _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when) + _view_sess, _download_sess = 0, 0 # no session info on preprints (yet) + else: + _views, _view_sess = _cumulative_countedusage_views(osfid, until_when) + _downloads, _download_sess = _cumulative_countedusage_downloads( + osfid, until_when + ) + return (_views, _view_sess, _downloads, _download_sess) + + +def _cumulative_countedusage_views( + osfid: str, until_when: str +) -> tuple[int, int]: + """compute view_session_count separately to avoid double-counting + + (the same session may be represented in both the composite agg on `item_guid` + and that on `surrounding_guids`) + """ + # copied/adapted from osf.metrics.reporters.public_item_usage + _search = ( + CountedUsageEs6.search() + .filter("term", item_public=True) + .filter("range", timestamp={"lt": until_when}) + .filter("term", action_labels="view") + .filter( + "bool", + should=[ + {"term": {"item_guid": osfid}}, + {"term": {"surrounding_guids": osfid}}, + ], + minimum_should_match=1, + ) + .extra(size=0) # only aggregations, no hits + ) + _search.aggs.metric( + "agg_session_count", + "cardinality", + field="session_id", + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _view_count = _response.hits.total + _view_session_count = ( + _response.aggregations.agg_session_count.value + if "agg_session_count" in _response.aggregations + else 0 + ) + return (_view_count, _view_session_count) + + +def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]: + """aggregate downloads on each osfid (not including components/files)""" + # copied/adapted from osf.metrics.reporters.public_item_usage + _search = ( + CountedUsageEs6.search() + .filter("term", item_public=True) + .filter("range", timestamp={"lt": until_when}) + .filter("term", action_labels="download") + .filter("term", item_guid=osfid) + ) + _search.aggs.metric( + "agg_session_count", + "cardinality", + field="session_id", + precision_threshold=_MAX_CARDINALITY_PRECISION, + ) + _response = _search.execute() + _download_count = _response.hits.total + _download_session_count = ( + _response.aggregations.agg_session_count.value + if "agg_session_count" in _response.aggregations + else 0 + ) + return (_download_count, _download_session_count) + + +def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int: + """aggregate views on each preprint""" + # copied/adapted from osf.metrics.preprint_metrics + _search = ( + preprint_metric_cls.search() + .filter("term", preprint_id=osfid) + .filter("range", timestamp={"lt": until_when}) + .extra(size=0) # no hits; only aggs + ) + _search.aggs.metric("agg_count", "sum", field="count") + _response = _search.execute() + _view_count = ( + int(_response.aggregations.agg_count.value) + if hasattr(_response.aggregations, "agg_count") + else 0 + ) + return _view_count + + def _iri_from_osfid(osfid: str) -> str: return f"{website_settings.DOMAIN}{osfid}" @@ -348,34 +501,83 @@ def _convert_item_type(es6_usage_dict): previous item_types use `type(osf_model).__name__.lower()` """ - try: - _modelname = es6_usage_dict["item_type"] - except KeyError: - # this probably only happens in fake data - return None - assert isinstance(_modelname, str) + _modelname = es6_usage_dict.get("item_type") match _modelname: + case "" | None: + return OSF.Object case "osfuser": - return rdfutils.DCTERMS.Agent + return DCTERMS.Agent case "preprint": - return rdfutils.OSF.Preprint + return OSF.Preprint case "registration": return ( - rdfutils.OSF.RegistrationComponent + OSF.RegistrationComponent if es6_usage_dict.get("surrounding_guids") - else rdfutils.OSF.Registration + else OSF.Registration ) case "node": return ( - rdfutils.OSF.ProjectComponent + OSF.ProjectComponent if es6_usage_dict.get("surrounding_guids") - else rdfutils.OSF.Project + else OSF.Project ) case _ if "file" in _modelname: # hack for the many "filenode" models - return rdfutils.OSF.File + return OSF.File case _: # give up gracefully - _logger.error(f"unknown item type: {_modelname}") - return _modelname + return OSF.Object + + +@functools.lru_cache +def _convert_database_iri(provider_id: str | None, item_type_iri: str) -> str: + if not provider_id: + return website_settings.DOMAIN # osf is a provider, sure why not + + def _fallback_iri(): + return f"urn:osf.io:{provider_id}" + + match item_type_iri: + case OSF.ProjectComponent | OSF.Project | DCTERMS.Agent: + # implicit "osf" provider + return website_settings.DOMAIN + case OSF.Preprint: + try: + _provider = osfdb.PreprintProvider.objects.get(_id=provider_id) + except osfdb.PreprintProvider.DoesNotExist: + _logger.error(f"unknown preprint provider {provider_id!r}") + return _fallback_iri() + else: + return _provider.get_semantic_iri() + case OSF.RegistrationComponent | OSF.Registration: + try: + _provider = osfdb.RegistrationProvider.objects.get(_id=provider_id) + except osfdb.RegistrationProvider.DoesNotExist: + _logger.error(f"unknown registration provider {provider_id!r}") + return _fallback_iri() + else: + return _provider.get_semantic_iri() + case OSF.File: + # file providers are a different thing that don't really have an iri, just an id + return _fallback_iri() + case _: # give up gracefully + _logger.error( + f"unknown item type {item_type_iri!r} with provider {provider_id!r}" + ) + return _fallback_iri() + + +def _each_usage_report_osfid(started_at, after_osfid=None): + _search = ( + es6_reports.PublicItemUsageReport.search() + .filter("range", timestamp={"lt": started_at}) + .extra(size=0) + ) + _search.aggs.bucket( + "agg_osfid", + "composite", + sources=[{"osfid": {"terms": {"field": "item_osfid"}}}], + size=500, + ) + return _iter_composite_bucket_keys(_search, "agg_osfid", "osfid", after=after_osfid) ### @@ -449,8 +651,7 @@ def _handle_unchanged(self, *, start: bool): ) if start: # schedule task self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) - migrate_unchanged_recordtype(_es6_cls.__name__) - # TODO: migrate_unchanged_recordtype.apply_async(...) + migrate_unchanged_recordtype.delay(_es6_cls.__name__) self.stdout.write("---") def _handle_usage_events(self, *, start: bool): @@ -475,17 +676,14 @@ def _handle_usage_events(self, *, start: bool): if start: # schedule (per-day?) tasks (if --start) self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") _started = self._migration_started_at - _range_start = ( - _started - datetime.timedelta(days=_USAGE_DAYS_BACK) - ).date() + _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() _range_end = _started.date() + datetime.timedelta(days=1) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() - # TODO: .apply_async(...) - migrate_counted_usages(_from_str, _until_str) - migrate_preprint_views(_from_str, _until_str) - migrate_preprint_downloads(_from_str, _until_str) + migrate_counted_usages.delay(_from_str, _until_str) + migrate_preprint_views.delay(_from_str, _until_str) + migrate_preprint_downloads.delay(_from_str, _until_str) self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): @@ -515,7 +713,11 @@ def _handle_usage_reports(self, *, start: bool): self.stdout.write( f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" ) - # TODO: migrate_usage_reports.apply_async(...) + for _osfid in _each_usage_report_osfid( + started_at=self._migration_started_at + ): + migrate_usage_reports(_osfid) + # TODO: migrate_usage_reports.apply_async(...) self.stdout.write("---") @functools.cached_property diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 1824fcf2b3f..fd8475b1bc3 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -77,19 +77,24 @@ class PageviewInfo(esdsl.InnerDoc): class OsfCountedUsageRecord(djelme.CountedUsageRecord): ''' - - inherited fields: - platform_iri: str - database_iri: str - item_iri: str - sessionhour_id: str - within_iris: list[str] + Aim to support a COUNTER-style reporting api + https://cop5.projectcounter.org/en/5.1/appendices/a-glossary-of-terms.html + https://coprd.countermetrics.org/en/1.0.1/appendices/a-glossary.html ''' - # osf-specific fields + + # inherited fields: + # timestamp: datetime.datetime + # platform_iri: str + # database_iri: str + # item_iri: str + # sessionhour_id: str + # within_iris: list[str] + + # osf-specific fields: item_osfid: str item_type: str item_public: bool - provider_id: str + provider_id: str | None user_is_authenticated: bool action_labels: list[str] pageview_info: PageviewInfo | None @@ -109,6 +114,38 @@ def clean(self): if self.item_iri not in self.within_iris: self.within_iris = [self.item_iri, *self.within_iris] + def _get_unique_together_values(self): + """get "unique together" values for "ON CONFLICT UPDATE" behavior + + override djelme.BaseDjelmeRecord._get_unique_together_values + for more complex logic than UNIQUE_TOGETHER_FIELDS + to slightly better approximate `counter:Double-Click Filtering` + """ + # note: copied from osf.metrics.counted_usage._fill_document_id + target_identifier = ( + self.pageview_info.page_url + if self.pageview_info is not None and self.pageview_info.page_url is not None + else self.item_osfid + ) + # slice the day into an array of 30-second windows, + # find this timestamp's windowslice index + day_start = datetime.datetime( + self.timestamp.year, + self.timestamp.month, + self.timestamp.day, + tzinfo=datetime.UTC, + ) + time_in_seconds = (self.timestamp - day_start).total_seconds() + time_window = int(time_in_seconds / 30) # 30-second windows + return ( # unique-together values: + self.platform_iri, + target_identifier, + self.sessionhour_id, + self.timestamp.date(), + time_window, + ','.join(sorted(self.action_labels)), + ) + class ActionLabel(enum.Enum): SEARCH = 'search' # counter:Search diff --git a/poetry.lock b/poetry.lock index 09ee8c9749b..14113d228b3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" -resolved_reference = "445fcea0aa6b5d07523cd67e959cb14088f15bb0" +reference = "a1e00e468830a40758caa8afa4b838821471f5c1" +resolved_reference = "a1e00e468830a40758caa8afa4b838821471f5c1" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "9aea963ca1a8b23c8e07fa22b34dc23c0f53d1d017edf29aad65a733ab4832fe" +content-hash = "1ba293f397fef29212fc58bfb8e08753f64bf43471a6fd2eb9d71bfded4ae326" diff --git a/pyproject.toml b/pyproject.toml index a0a08b48047..f7e6eb5bb41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "445fcea0aa6b5d07523cd67e959cb14088f15bb0"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "a1e00e468830a40758caa8afa4b838821471f5c1"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 69daa8744cdc5d47c3ba3fec571c564624d39dbd Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 10:11:50 -0400 Subject: [PATCH 052/100] wip --- docker-compose.yml | 7 +- .../commands/fake_metrics_reports.py | 19 +++ .../commands/migrate_osfmetrics_6to8.py | 150 ++++++++++-------- osf/metrics/es8_metrics.py | 41 ++++- poetry.lock | 8 +- pyproject.toml | 2 +- website/settings/defaults.py | 1 + 7 files changed, 157 insertions(+), 71 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 83e8fd27483..d771c75797a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -72,6 +72,8 @@ services: # Temporary: Remove when we've upgraded to ES6 elasticsearch6: image: docker.elastic.co/elasticsearch/elasticsearch:6.3.1 + environment: + - ES_JAVA_OPTS=-Xms512m -Xmx512m # reduce memory usage ports: - 9201:9200 volumes: @@ -91,10 +93,9 @@ services: - elasticsearch8_data_vol:/usr/share/elasticsearch/data healthcheck: start_period: 15s - test: ["CMD", "curl", "-sf", "http://localhost:9200/_cluster/health?wait_for_status=yellow&timeout=30s"] + test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"' interval: 10s - timeout: 30s - retries: 5 + retries: 30 stdin_open: true postgres: diff --git a/osf/management/commands/fake_metrics_reports.py b/osf/management/commands/fake_metrics_reports.py index 765d6e475c1..53e13472e74 100644 --- a/osf/management/commands/fake_metrics_reports.py +++ b/osf/management/commands/fake_metrics_reports.py @@ -8,6 +8,8 @@ UserSummaryReport, PreprintSummaryReport, ) +from osf.metrics.reports import PublicItemUsageReport +from osf.metrics.utils import YearMonth from osf.models import PreprintProvider @@ -53,10 +55,27 @@ def fake_preprint_counts(days_back): ).save() +def fake_usage_reports(osfid: str, count: int): + _ym = YearMonth.from_date(date.today()).prior() + for _months in range(count): + PublicItemUsageReport.record( + item_osfid=osfid, + report_yearmonth=_ym, + view_count=(_vc := randint(0, 500)), + view_session_count=randint(0, _vc), + download_count=(_dc := randint(0, 300)), + download_session_count=randint(0, _dc), + ) + _ym = _ym.prior() + + class Command(BaseCommand): def handle(self, *args, **kwargs): if not settings.DEBUG: raise NotImplementedError('fake_reports requires DEBUG mode') fake_user_counts(1000) fake_preprint_counts(1000) + fake_usage_reports('blarg', 100) + fake_usage_reports('blerg', 50) + fake_usage_reports('bleg', 50) # TODO: more reports diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index acbc43df5dd..c72765ab261 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -65,8 +65,8 @@ Elastic8ConnectionError, PostgresOperationalError, ), - max_retries=50, - retry_backoff=True, + retry_backoff=True, # exponential backoff, with jitter + max_retries=20, ) ### @@ -87,8 +87,8 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es8_recordtype(**_convert_kwargs(_hit["_source"])) for _hit in _es6_scan_all(_es6_recordtype) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(_es8_recordtype, _each_new) + _debug_migrate(_each_new) + # return _es8_bulk_save(_es8_recordtype, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -103,8 +103,8 @@ def migrate_counted_usages(from_when: str, until_when: str): addl_filter={"exists": {"field": "item_guid"}}, ) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + _debug_migrate(_each_new) + #return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -115,8 +115,8 @@ def migrate_preprint_views(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintView, from_when, until_when) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -127,8 +127,8 @@ def migrate_preprint_downloads(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + _debug_migrate(_each_new) + # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -136,14 +136,23 @@ def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 # add cumulative count def _each_new(): - for _hit in _es6_scan_all( + _each_hit = _es6_scan_all( es6_reports.PublicItemUsageReport, query_body={"query": {"term": {"item_osfid": osfid}}}, - ): - yield _convert_public_usage_report(_hit["_source"]) + ) + # only a few dozen of these per item; fine to hold all at once + _sorted_sources = sorted( + (_hit["_source"] for _hit in _each_hit), + key=lambda _s: _s["report_yearmonth"], + ) + _prior_report = None + for _source in _sorted_sources: + yield ( + _prior_report := _convert_public_usage_report(_source, _prior_report) + ) - # _debug_migrate(_each_new) - return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) + _debug_migrate(_each_new()) + # return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) ### @@ -175,6 +184,7 @@ def _delete_all_es8(): def _debug_migrate(each_new): # TODO: remove this for _each in each_new: + _each.full_clean() pprint(_each.to_dict(include_meta=True)) @@ -320,75 +330,89 @@ def _each_kwarg(): return dict(_each_kwarg()) -def _convert_counted_usage(source_dict) -> es8_metrics.OsfCountedUsageRecord: - _item_iri = _iri_from_osfid(source_dict["item_guid"]) - _item_type = _convert_item_type(source_dict) +def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: + _item_iri = _iri_from_osfid(source["item_guid"]) + _item_type = _convert_item_type(source) return es8_metrics.OsfCountedUsageRecord( # fields from djelme.CountedUsageRecord: - timestamp=source_dict["timestamp"], - sessionhour_id=source_dict["session_id"], - platform_iri=source_dict["platform_iri"], - database_iri=_convert_database_iri(source_dict.get("provider_id"), _item_type), + timestamp=source["timestamp"], + sessionhour_id=source["session_id"], + platform_iri=source.get("platform_iri") or website_settings.DOMAIN, + database_iri=_convert_database_iri(source.get("provider_id"), _item_type), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) - for _within_osfid in source_dict.get("surrounding_guids", ()) + for _within_osfid in source.get("surrounding_guids", ()) ], # fields from OsfCountedUsageRecord: - item_osfid=source_dict["item_guid"], + item_osfid=source["item_guid"], item_type=_item_type, - item_public=source_dict["item_public"], - provider_id=source_dict.get("provider_id"), - user_is_authenticated=source_dict["user_is_authenticated"], - action_labels=source_dict["action_labels"], + item_public=source["item_public"], + provider_id=source.get("provider_id"), + user_is_authenticated=source["user_is_authenticated"], + action_labels=source["action_labels"], # TODO: does this need the PageviewInfo object or is the dictionary fine? - pageview_info=source_dict.get("pageview_info"), + pageview_info=source.get("pageview_info"), ) def _convert_preprint_metric( - source_dict, action_labels: list[str] + source: dict, action_labels: list[str] ) -> es8_metrics.OsfCountedUsageRecord: - _preprint_iri = _iri_from_osfid(source_dict["preprint_id"]) + _preprint_iri = _iri_from_osfid(source["preprint_id"]) return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: - timestamp=source_dict["timestamp"], - user_id=source_dict.get("user_id"), + timestamp=source["timestamp"], + user_id=source.get("user_id"), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri( - source_dict.get("provider_id"), OSF.Preprint - ), + database_iri=_convert_database_iri(source.get("provider_id"), OSF.Preprint), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: - item_osfid=source_dict["preprint_id"], + item_osfid=source["preprint_id"], item_type=OSF.Preprint, item_public=True, - provider_id=source_dict.get("provider_id"), - user_is_authenticated=bool(source_dict.get("user_id")), + provider_id=source.get("provider_id"), + user_is_authenticated=bool(source.get("user_id")), action_labels=action_labels, ) -def _convert_public_usage_report(source_dict) -> es8_metrics.PublicItemUsageReportEs8: - _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( - osfid=source_dict["item_osfid"], - until_when=YearMonth.from_str(source_dict["report_yearmonth"]).month_end(), - item_type=source_dict.get("item_type"), - ) +def _convert_public_usage_report( + source: dict, + prior_report: es8_metrics.PublicItemUsageReportEs8 | None, +) -> es8_metrics.PublicItemUsageReportEs8: + if prior_report is None: + _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( + osfid=source["item_osfid"], + until_when=YearMonth.from_str(source["report_yearmonth"]).month_end(), + item_type=source.get("item_type"), + ) + else: + _c_views = prior_report.cumulative_view_count + source.get("view_count", 0) + _c_view_sess = prior_report.cumulative_view_session_count + source.get( + "view_session_count", 0 + ) + _c_downloads = prior_report.cumulative_download_count + source.get( + "download_count", 0 + ) + _c_download_sess = prior_report.cumulative_download_session_count + source.get( + "download_session_count", 0 + ) return es8_metrics.PublicItemUsageReportEs8( - item_osfid=source_dict["item_osfid"], - item_type=source_dict.get("item_type"), - provider_id=source_dict.get("provider_id"), - platform_iri=source_dict.get("platform_iri"), - view_count=source_dict.get("view_count"), - view_session_count=source_dict.get("view_session_count"), + cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), + item_osfid=source["item_osfid"], + item_type=source.get("item_type"), + provider_id=source.get("provider_id"), + platform_iri=source.get("platform_iri") or website_settings.DOMAIN, + view_count=source.get("view_count"), + view_session_count=source.get("view_session_count"), cumulative_view_count=_c_views, cumulative_view_session_count=_c_view_sess, - download_count=source_dict.get("download_count"), - download_session_count=source_dict.get("download_session_count"), + download_count=source.get("download_count"), + download_session_count=source.get("download_session_count"), cumulative_download_count=_c_downloads, cumulative_download_session_count=_c_download_sess, ) @@ -407,9 +431,7 @@ def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): return (_views, _view_sess, _downloads, _download_sess) -def _cumulative_countedusage_views( - osfid: str, until_when: str -) -> tuple[int, int]: +def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]: """compute view_session_count separately to avoid double-counting (the same session may be represented in both the composite agg on `item_guid` @@ -651,7 +673,8 @@ def _handle_unchanged(self, *, start: bool): ) if start: # schedule task self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) - migrate_unchanged_recordtype.delay(_es6_cls.__name__) + #migrate_unchanged_recordtype.delay(_es6_cls.__name__) + migrate_unchanged_recordtype(_es6_cls.__name__) self.stdout.write("---") def _handle_usage_events(self, *, start: bool): @@ -681,9 +704,12 @@ def _handle_usage_events(self, *, start: bool): for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() - migrate_counted_usages.delay(_from_str, _until_str) - migrate_preprint_views.delay(_from_str, _until_str) - migrate_preprint_downloads.delay(_from_str, _until_str) + # migrate_counted_usages.delay(_from_str, _until_str) + # migrate_preprint_views.delay(_from_str, _until_str) + # migrate_preprint_downloads.delay(_from_str, _until_str) + migrate_counted_usages(_from_str, _until_str) + migrate_preprint_views(_from_str, _until_str) + migrate_preprint_downloads(_from_str, _until_str) self.stdout.write("---") def _handle_usage_reports(self, *, start: bool): @@ -709,7 +735,7 @@ def _handle_usage_reports(self, *, start: bool): ) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts - if start: # schedule per-item tasks + if start: self.stdout.write( f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" ) @@ -717,7 +743,7 @@ def _handle_usage_reports(self, *, start: bool): started_at=self._migration_started_at ): migrate_usage_reports(_osfid) - # TODO: migrate_usage_reports.apply_async(...) + # TODO: migrate_usage_reports.delay(...) self.stdout.write("---") @functools.cached_property diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index fd8475b1bc3..67fee676112 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -3,7 +3,7 @@ from urllib.parse import urlsplit import elasticsearch8.dsl as esdsl -from elasticsearch_metrics import DAILY, MONTHLY +from elasticsearch_metrics import DAILY, MONTHLY, YEARLY import elasticsearch_metrics.imps.elastic8 as djelme from osf.metrics.utils import YearMonth @@ -233,12 +233,18 @@ class StorageAddonUsageEs8(djelme.CyclicRecord): usage_by_addon: list[UsageByStorageAddon] + class Meta: + timeseries_index_timedepth = YEARLY + class DownloadCountReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY daily_file_downloads: int + class Meta: + timeseries_index_timedepth = YEARLY + class InstitutionSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -252,6 +258,9 @@ class InstitutionSummaryReportEs8(djelme.CyclicRecord): registered_nodes: RegistrationRunningTotals registered_projects: RegistrationRunningTotals + class Meta: + timeseries_index_timedepth = MONTHLY + class NewUserDomainReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -260,6 +269,9 @@ class NewUserDomainReportEs8(djelme.CyclicRecord): domain_name: str new_user_count: int + class Meta: + timeseries_index_timedepth = MONTHLY + class NodeSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -269,12 +281,18 @@ class NodeSummaryReportEs8(djelme.CyclicRecord): registered_nodes: RegistrationRunningTotals registered_projects: RegistrationRunningTotals + class Meta: + timeseries_index_timedepth = YEARLY + class OsfstorageFileCountReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY files: FileRunningTotals + class Meta: + timeseries_index_timedepth = YEARLY + class PreprintSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -283,6 +301,9 @@ class PreprintSummaryReportEs8(djelme.CyclicRecord): provider_key: str preprint_count: int + class Meta: + timeseries_index_timedepth = MONTHLY + class UserSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = DAILY @@ -294,6 +315,9 @@ class UserSummaryReportEs8(djelme.CyclicRecord): new_users_with_institution_daily: int unconfirmed: int + class Meta: + timeseries_index_timedepth = YEARLY + class SpamSummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -310,6 +334,9 @@ class SpamSummaryReportEs8(djelme.CyclicRecord): user_marked_as_spam: int user_marked_as_ham: int + class Meta: + timeseries_index_timedepth = YEARLY + class InstitutionalUserReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -333,6 +360,9 @@ class InstitutionalUserReportEs8(djelme.CyclicRecord): public_file_count: int = esdsl.mapped_field(esdsl.Long()) storage_byte_count: int = esdsl.mapped_field(esdsl.Long()) + class Meta: + timeseries_index_timedepth = MONTHLY + class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -350,6 +380,9 @@ class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): monthly_logged_in_user_count: int = esdsl.mapped_field(esdsl.Long()) monthly_active_user_count: int = esdsl.mapped_field(esdsl.Long()) + class Meta: + timeseries_index_timedepth = YEARLY + class PublicItemUsageReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -375,6 +408,9 @@ class PublicItemUsageReportEs8(djelme.CyclicRecord): cumulative_download_count: int = esdsl.mapped_field(esdsl.Long()) cumulative_download_session_count: int = esdsl.mapped_field(esdsl.Long()) + class Meta: + timeseries_index_timedepth = MONTHLY + class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): CYCLE_TIMEDEPTH = MONTHLY @@ -388,6 +424,9 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): preprint_akismet_flagged: int preprint_akismet_hammed: int + class Meta: + timeseries_index_timedepth = YEARLY + ### # data migration state diff --git a/poetry.lock b/poetry.lock index 14113d228b3..7aee4eca49f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1085,7 +1085,7 @@ Django = ">=2.0" [[package]] name = "django-elasticsearch-metrics" -version = "2026.0.3" +version = "2026.0.4" description = "Django app for storing time-series metrics in Elasticsearch." optional = false python-versions = ">=3.10,<4" @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "a1e00e468830a40758caa8afa4b838821471f5c1" -resolved_reference = "a1e00e468830a40758caa8afa4b838821471f5c1" +reference = "fed3c14f213642284a197ac2933106cdafede25b" +resolved_reference = "fed3c14f213642284a197ac2933106cdafede25b" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "1ba293f397fef29212fc58bfb8e08753f64bf43471a6fd2eb9d71bfded4ae326" +content-hash = "0f9c547a6309aa915b25f9a7a98e5d0c15c867d577a883547d894ca173cb2344" diff --git a/pyproject.toml b/pyproject.toml index f7e6eb5bb41..b04e0540d90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "a1e00e468830a40758caa8afa4b838821471f5c1"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "fed3c14f213642284a197ac2933106cdafede25b"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 3053f9d1075..dc69126ca37 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -557,6 +557,7 @@ class CeleryConfig: task_routes = ('framework.celery_tasks.routers.CeleryRouter', ) task_ignore_result = True task_store_errors_even_if_ignored = True + result_extended = True broker_url = os.environ.get('BROKER_URL', f'amqp://{RABBITMQ_USERNAME}:{RABBITMQ_PASSWORD}@{RABBITMQ_HOST}:{RABBITMQ_PORT}/{RABBITMQ_VHOST}') broker_use_ssl = False From da7910a86760bfd9a5d581f2c9b35692d5d30670 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 12:11:55 -0400 Subject: [PATCH 053/100] wip --- .../commands/migrate_osfmetrics_6to8.py | 248 ++++++++---------- osf/metrics/es8_metrics.py | 24 +- website/settings/defaults.py | 1 + 3 files changed, 125 insertions(+), 148 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index c72765ab261..5ee937e80c0 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -17,7 +17,6 @@ from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app -from osf.metadata.rdfutils import OSF, DCTERMS from osf.metrics.preprint_metrics import ( PreprintView, PreprintDownload, @@ -87,8 +86,7 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): _es8_recordtype(**_convert_kwargs(_hit["_source"])) for _hit in _es6_scan_all(_es6_recordtype) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(_es8_recordtype, _each_new) + return _es8_bulk_save(_es8_recordtype, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -103,8 +101,7 @@ def migrate_counted_usages(from_when: str, until_when: str): addl_filter={"exists": {"field": "item_guid"}}, ) ) - _debug_migrate(_each_new) - #return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -115,8 +112,7 @@ def migrate_preprint_views(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintView, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) @@ -127,32 +123,31 @@ def migrate_preprint_downloads(from_when: str, until_when: str): _convert_preprint_metric(_hit["_source"], _action_labels) for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) ) - _debug_migrate(_each_new) - # return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) + return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) def migrate_usage_reports(osfid: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 - # add cumulative count def _each_new(): + # go in sorted order to build cumulative counts + # (only a few dozen of these per item; should be fine to sort and load all at once) _each_hit = _es6_scan_all( es6_reports.PublicItemUsageReport, - query_body={"query": {"term": {"item_osfid": osfid}}}, - ) - # only a few dozen of these per item; fine to hold all at once - _sorted_sources = sorted( - (_hit["_source"] for _hit in _each_hit), - key=lambda _s: _s["report_yearmonth"], + query_body={ + "query": {"term": {"item_osfid": osfid}}, + "sort": "report_yearmonth", + }, ) _prior_report = None - for _source in _sorted_sources: + for _hit in list(_each_hit): yield ( - _prior_report := _convert_public_usage_report(_source, _prior_report) + _prior_report := _convert_public_usage_report( + _hit["_source"], _prior_report + ) ) - _debug_migrate(_each_new()) - # return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new) + return _es8_bulk_save(es8_metrics.PublicItemUsageReportEs8, _each_new()) ### @@ -332,13 +327,12 @@ def _each_kwarg(): def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: _item_iri = _iri_from_osfid(source["item_guid"]) - _item_type = _convert_item_type(source) return es8_metrics.OsfCountedUsageRecord( # fields from djelme.CountedUsageRecord: timestamp=source["timestamp"], sessionhour_id=source["session_id"], platform_iri=source.get("platform_iri") or website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), _item_type), + database_iri=_convert_database_iri(source.get("provider_id"), source.get("item_type")), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) @@ -346,11 +340,11 @@ def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: ], # fields from OsfCountedUsageRecord: item_osfid=source["item_guid"], - item_type=_item_type, - item_public=source["item_public"], + item_type=source.get("item_type", "osf:Object"), + item_public=source.get("item_public"), provider_id=source.get("provider_id"), - user_is_authenticated=source["user_is_authenticated"], - action_labels=source["action_labels"], + user_is_authenticated=source.get("user_is_authenticated"), + action_labels=source.get("action_labels"), # TODO: does this need the PageviewInfo object or is the dictionary fine? pageview_info=source.get("pageview_info"), ) @@ -367,12 +361,12 @@ def _convert_preprint_metric( user_id=source.get("user_id"), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), OSF.Preprint), + database_iri=_convert_database_iri(source.get("provider_id"), "preprint"), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: item_osfid=source["preprint_id"], - item_type=OSF.Preprint, + item_type="preprint", item_public=True, provider_id=source.get("provider_id"), user_is_authenticated=bool(source.get("user_id")), @@ -402,7 +396,7 @@ def _convert_public_usage_report( "download_session_count", 0 ) return es8_metrics.PublicItemUsageReportEs8( - cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), + cycle_coverage=_semverish_from_yearmonth(source["report_yearmonth"]), item_osfid=source["item_osfid"], item_type=source.get("item_type"), provider_id=source.get("provider_id"), @@ -518,50 +512,19 @@ def _iri_from_osfid(osfid: str) -> str: return f"{website_settings.DOMAIN}{osfid}" -def _convert_item_type(es6_usage_dict): - """convert model-name item types to OSFMAP item types - - previous item_types use `type(osf_model).__name__.lower()` - """ - _modelname = es6_usage_dict.get("item_type") - match _modelname: - case "" | None: - return OSF.Object - case "osfuser": - return DCTERMS.Agent - case "preprint": - return OSF.Preprint - case "registration": - return ( - OSF.RegistrationComponent - if es6_usage_dict.get("surrounding_guids") - else OSF.Registration - ) - case "node": - return ( - OSF.ProjectComponent - if es6_usage_dict.get("surrounding_guids") - else OSF.Project - ) - case _ if "file" in _modelname: # hack for the many "filenode" models - return OSF.File - case _: # give up gracefully - return OSF.Object - - @functools.lru_cache -def _convert_database_iri(provider_id: str | None, item_type_iri: str) -> str: +def _convert_database_iri(provider_id: str | None, item_type: str) -> str: if not provider_id: return website_settings.DOMAIN # osf is a provider, sure why not def _fallback_iri(): return f"urn:osf.io:{provider_id}" - match item_type_iri: - case OSF.ProjectComponent | OSF.Project | DCTERMS.Agent: + match item_type: # lower-cased osf.models class names + case "node" | "osfuser": # implicit "osf" provider return website_settings.DOMAIN - case OSF.Preprint: + case "preprint": try: _provider = osfdb.PreprintProvider.objects.get(_id=provider_id) except osfdb.PreprintProvider.DoesNotExist: @@ -569,7 +532,7 @@ def _fallback_iri(): return _fallback_iri() else: return _provider.get_semantic_iri() - case OSF.RegistrationComponent | OSF.Registration: + case "registration": try: _provider = osfdb.RegistrationProvider.objects.get(_id=provider_id) except osfdb.RegistrationProvider.DoesNotExist: @@ -577,12 +540,12 @@ def _fallback_iri(): return _fallback_iri() else: return _provider.get_semantic_iri() - case OSF.File: + case _ if "file" in item_type: # file providers are a different thing that don't really have an iri, just an id - return _fallback_iri() + return f"urn:files.osf.io:{provider_id}" case _: # give up gracefully _logger.error( - f"unknown item type {item_type_iri!r} with provider {provider_id!r}" + f"unknown item type {item_type!r} with provider {provider_id!r}" ) return _fallback_iri() @@ -612,6 +575,10 @@ def add_arguments(self, parser): "--no-setup", action="store_true", ) + parser.add_argument( + "--no-counts", + action="store_true", + ) parser.add_argument( "--clear-state", action="store_true", @@ -636,12 +603,13 @@ def add_arguments(self, parser): def handle( self, *, + no_setup, + no_counts, + clear_state, start, unchanged, usage_events, usage_reports, - clear_state, - no_setup, **kwargs, ): self._quiet_chatty_loggers() @@ -652,99 +620,94 @@ def handle( self._check_started_at(start_now=start) _default_all = not any((unchanged, usage_events, usage_reports)) if unchanged or _default_all: - self._handle_unchanged(start=start) + self._handle_unchanged(start=start, no_counts=no_counts) if usage_events or _default_all: - self._handle_usage_events(start=start) + self._handle_usage_events(start=start, no_counts=no_counts) if usage_reports or _default_all: - self._handle_usage_reports(start=start) + self._handle_usage_reports(start=start, no_counts=no_counts) - def _handle_unchanged(self, *, start: bool): + def _handle_unchanged(self, *, start: bool, no_counts: bool): # for each (unchanged) report/event: for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): - # display counts - _es6_count = _es6_cls.search().count() - _es8_count = _es8_cls.search().count() - self._write_tabbed("es6", _es6_cls, _es6_count) + if not no_counts: + # display counts + _es6_count = _es6_cls.search().count() + _es8_count = _es8_cls.search().count() + self._write_tabbed("es6", _es6_cls, _es6_count) + self._write_tabbed( + "es8", + _es8_cls, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + if start: # schedule task + self.stdout.write(f"starting {_es6_cls.__name__} => {_es8_cls.__name__}") + migrate_unchanged_recordtype.delay(_es6_cls.__name__) + + def _handle_usage_events(self, *, start: bool, no_counts: bool): + # for counted-usage events: + _started = self._migration_started_at + _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() + _range_end = _started.date() + datetime.timedelta(days=1) + if not no_counts: + # display counts for each view/download event type + _range_q = {"range": {"timestamp": {"gte": _range_start.isoformat(), "lt": _range_end.isoformat()}}} + _es6_pview_count = PreprintView.search().filter(_range_q).count() + _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count() + _es6_usage_event_count = CountedUsageEs6.search().filter(_range_q).count() + _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() + self._write_tabbed("es6", PreprintView, _es6_pview_count) + self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) + self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed("es6", f"(total between {_range_start} and {_range_end})", _es6_count) self._write_tabbed( "es8", - _es8_cls, + es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) - if start: # schedule task - self._write_tabbed("starting", _es6_cls, "=>", _es8_cls) - #migrate_unchanged_recordtype.delay(_es6_cls.__name__) - migrate_unchanged_recordtype(_es6_cls.__name__) - self.stdout.write("---") - - def _handle_usage_events(self, *, start: bool): - # for counted-usage events: - # TODO: last X months only - # display counts for each view/download event type - _es6_pview_count = PreprintView.search().count() - _es6_pdownload_count = PreprintDownload.search().count() - _es6_usage_event_count = CountedUsageEs6.search().count() - _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count - _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed("es6", PreprintView, _es6_pview_count) - self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) - self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) - self._write_tabbed("es6", "(total to migrate)", _es6_count) - self._write_tabbed( - "es8", - es8_metrics.OsfCountedUsageRecord, - _es8_count, - style=self._eq_style(_es8_count, _es6_count), - ) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord}") - _started = self._migration_started_at - _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() - _range_end = _started.date() + datetime.timedelta(days=1) + self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}") for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() - # migrate_counted_usages.delay(_from_str, _until_str) - # migrate_preprint_views.delay(_from_str, _until_str) - # migrate_preprint_downloads.delay(_from_str, _until_str) - migrate_counted_usages(_from_str, _until_str) - migrate_preprint_views(_from_str, _until_str) - migrate_preprint_downloads(_from_str, _until_str) - self.stdout.write("---") - - def _handle_usage_reports(self, *, start: bool): - # display counts of reports and distinct items - _es6_count, _es6_item_count = _es6_usage_report_counts() - _es8_count, _es8_item_count = _es8_usage_report_counts() - self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) - self._write_tabbed( - "es8", - es8_metrics.PublicItemUsageReportEs8, - _es8_count, - style=self._eq_style(_es8_count, _es6_count), - ) - self._write_tabbed( - "es6", es6_reports.PublicItemUsageReport, "(items)", _es6_item_count - ) - self._write_tabbed( - "es8", - es8_metrics.PublicItemUsageReportEs8, - "(items)", - _es8_item_count, - style=self._eq_style(_es8_item_count, _es6_item_count), - ) + migrate_counted_usages.delay(_from_str, _until_str) + migrate_preprint_views.delay(_from_str, _until_str) + migrate_preprint_downloads.delay(_from_str, _until_str) + + def _handle_usage_reports(self, *, start: bool, no_counts: bool): + if not no_counts: + # display counts of reports and distinct items + _es6_count, _es6_item_count = _es6_usage_report_counts() + _es8_count, _es8_item_count = _es8_usage_report_counts() + self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + _es8_count, + style=self._eq_style(_es8_count, _es6_count), + ) + self._write_tabbed( + "es6", es6_reports.PublicItemUsageReport, "osfid count:", _es6_item_count + ) + self._write_tabbed( + "es8", + es8_metrics.PublicItemUsageReportEs8, + "(items)", + _es8_item_count, + style=self._eq_style(_es8_item_count, _es6_item_count), + ) # (if --start) schedule task per item (by composite agg on es6 public usage reports) # each item-task iter thru reports oldest to newest, adding cumulative counts if start: self.stdout.write( - f"starting per-item {es6_reports.PublicItemUsageReport} => {es8_metrics.PublicItemUsageReportEs8}" + f"starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}" ) for _osfid in _each_usage_report_osfid( started_at=self._migration_started_at ): - migrate_usage_reports(_osfid) - # TODO: migrate_usage_reports.delay(...) - self.stdout.write("---") + migrate_usage_reports.delay(_osfid) @functools.cached_property def _migration_started_at(self): @@ -757,8 +720,8 @@ def _check_started_at(self, start_now): f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" ) elif start_now: - del self._migration_started_at # clear cache _started_at = es8_metrics.Elastic6To8State.set_started_at_now() + del self._migration_started_at # clear cache self.stdout.write( f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" ) @@ -766,7 +729,6 @@ def _check_started_at(self, start_now): self.stdout.write( "osf.metrics 6->8 migration not started nor starting (run with `--start` to start)" ) - self.stdout.write("---") def _clear_state(self): self.stdout.write( diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 67fee676112..2f4023105d8 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -162,7 +162,7 @@ class RegistriesModerationMetricsEs8(djelme.EventRecord): from_state: str to_state: str user_id: str - comment: str + comment: str | None class Index: settings = { @@ -171,6 +171,9 @@ class Index: 'refresh_interval': '1s', } + class Meta: + timeseries_recordtype_name = 'RegistriesModerationMetrics' + ### # Reusable inner objects for reports @@ -235,6 +238,7 @@ class StorageAddonUsageEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'StorageAddonUsage' class DownloadCountReportEs8(djelme.CyclicRecord): @@ -244,6 +248,7 @@ class DownloadCountReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'DownloadCountReport' class InstitutionSummaryReportEs8(djelme.CyclicRecord): @@ -260,6 +265,7 @@ class InstitutionSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'InstitutionSummaryReport' class NewUserDomainReportEs8(djelme.CyclicRecord): @@ -271,6 +277,7 @@ class NewUserDomainReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'NewUserDomainReport' class NodeSummaryReportEs8(djelme.CyclicRecord): @@ -283,6 +290,7 @@ class NodeSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'NodeSummaryReport' class OsfstorageFileCountReportEs8(djelme.CyclicRecord): @@ -292,6 +300,7 @@ class OsfstorageFileCountReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'OsfstorageFileCountReport' class PreprintSummaryReportEs8(djelme.CyclicRecord): @@ -303,6 +312,7 @@ class PreprintSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'PreprintSummaryReport' class UserSummaryReportEs8(djelme.CyclicRecord): @@ -317,6 +327,7 @@ class UserSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'UserSummaryReport' class SpamSummaryReportEs8(djelme.CyclicRecord): @@ -336,6 +347,7 @@ class SpamSummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'SpamSummaryReport' class InstitutionalUserReportEs8(djelme.CyclicRecord): @@ -350,7 +362,7 @@ class InstitutionalUserReportEs8(djelme.CyclicRecord): month_last_login = YearmonthField() month_last_active = YearmonthField() account_creation_date = YearmonthField() - orcid_id: str + orcid_id: str | None # counts: public_project_count: int private_project_count: int @@ -362,6 +374,7 @@ class InstitutionalUserReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'InstitutionalUserReport' class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): @@ -382,6 +395,7 @@ class InstitutionMonthlySummaryReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'InstitutionMonthlySummaryReport' class PublicItemUsageReportEs8(djelme.CyclicRecord): @@ -410,6 +424,7 @@ class PublicItemUsageReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = MONTHLY + timeseries_recordtype_name = 'PublicItemUsageReport' class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): @@ -426,6 +441,7 @@ class PrivateSpamMetricsReportEs8(djelme.CyclicRecord): class Meta: timeseries_index_timedepth = YEARLY + timeseries_recordtype_name = 'PrivateSpamMetricsReport' ### @@ -440,9 +456,6 @@ class Elastic6To8State(djelme.SimpleRecord): default_factory=lambda: datetime.datetime.now(datetime.UTC), ) - class Index: - name = 'osf_elastic6to8state' - @classmethod def get_by_key(cls, key: str): _response = cls.search().query({'term': {'key': key}})[0].execute() @@ -460,4 +473,5 @@ def get_started_at(cls): @classmethod def set_started_at_now(cls): _record = cls.record(key='started_at') + cls.refresh() return _record.timestamp diff --git a/website/settings/defaults.py b/website/settings/defaults.py index dc69126ca37..2d174472576 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -608,6 +608,7 @@ class CeleryConfig: 'scripts.remove_after_use.merge_notification_subscription_provider_ct', 'scripts.disable_removed_beat_tasks', 'osf.management.commands.delete_withdrawn_or_failed_registration_files', + 'osf.management.commands.migrate_osfmetrics_6to8', ) # Modules that need metrics and release requirements From 95b42e600b11fb2b4f8e51dd6e80b53756575bab Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 12:48:39 -0400 Subject: [PATCH 054/100] wip --- docker-compose.yml | 5 + .../commands/migrate_osfmetrics_6to8.py | 147 +++++++++--------- poetry.lock | 6 +- pyproject.toml | 2 +- 4 files changed, 84 insertions(+), 76 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index d771c75797a..04d64c51fda 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -78,6 +78,11 @@ services: - 9201:9200 volumes: - elasticsearch6_data_vol:/usr/share/elasticsearch/data + healthcheck: + start_period: 15s + test: curl -s http://localhost:9200/_cluster/health | grep -vq '"status":"red"' + interval: 10s + retries: 30 stdin_open: true elasticsearch8: diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 5ee937e80c0..5ce383b99bc 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -2,7 +2,6 @@ import datetime import functools import logging -from pprint import pprint from django.core.management import call_command from django.core.management.base import BaseCommand @@ -11,7 +10,6 @@ from elasticsearch6 import helpers as es6_helpers from elasticsearch6_dsl.connections import connections as es6_connections from elasticsearch8.exceptions import ConnectionError as Elastic8ConnectionError -from elasticsearch8.dsl.connections import connections as es8_connections from elasticsearch_metrics.registry import djelme_registry from elasticsearch_metrics.imps import elastic8 as djel8me from psycopg2 import OperationalError as PostgresOperationalError @@ -73,10 +71,9 @@ @celery_app.task(**_TASK_KWARGS) -def migrate_unchanged_recordtype(es6_recordtype_name: str): +def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] - _assert_field_unchangedness(_es6_recordtype, _es8_recordtype) _convert_kwargs = ( _convert_unchanged_cyclicrecord_kwargs if issubclass(_es8_recordtype, djel8me.CyclicRecord) @@ -84,7 +81,7 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str): ) _each_new = ( _es8_recordtype(**_convert_kwargs(_hit["_source"])) - for _hit in _es6_scan_all(_es6_recordtype) + for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when) ) return _es8_bulk_save(_es8_recordtype, _each_new) @@ -96,8 +93,8 @@ def migrate_counted_usages(from_when: str, until_when: str): _convert_counted_usage(_hit["_source"]) for _hit in _es6_scan_range( CountedUsageEs6, - from_when, - until_when, + from_when=from_when, + until_when=until_when, addl_filter={"exists": {"field": "item_guid"}}, ) ) @@ -110,7 +107,9 @@ def migrate_preprint_views(from_when: str, until_when: str): _action_labels = ["view", "web"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) - for _hit in _es6_scan_range(PreprintView, from_when, until_when) + for _hit in _es6_scan_range( + PreprintView, from_when=from_when, until_when=until_when + ) ) return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @@ -121,23 +120,24 @@ def migrate_preprint_downloads(from_when: str, until_when: str): _action_labels = ["download"] _each_new = ( _convert_preprint_metric(_hit["_source"], _action_labels) - for _hit in _es6_scan_range(PreprintDownload, from_when, until_when) + for _hit in _es6_scan_range( + PreprintDownload, from_when=from_when, until_when=until_when + ) ) return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @celery_app.task(**_TASK_KWARGS) -def migrate_usage_reports(osfid: str): +def migrate_usage_reports(osfid: str, until_when: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 def _each_new(): # go in sorted order to build cumulative counts # (only a few dozen of these per item; should be fine to sort and load all at once) - _each_hit = _es6_scan_all( + _each_hit = _es6_scan_range( es6_reports.PublicItemUsageReport, - query_body={ - "query": {"term": {"item_osfid": osfid}}, - "sort": "report_yearmonth", - }, + until_when=until_when, + addl_filter={"term": {"item_osfid": osfid}}, + sort="report_yearmonth", ) _prior_report = None for _hit in list(_each_hit): @@ -158,31 +158,6 @@ def _es6_connection(): return es6_connections.get_connection("osfmetrics_es6") -def _es8_connection(): - return es8_connections.get_connection("osfmetrics_es8") - - -def _delete_all(recordtype): - # TODO: REMOVE THIS - recordtype.search().query({"match_all": {}}).delete() - recordtype.refresh() - - -def _delete_all_es8(): - # TODO: REMOVE THIS - for _es8_recordtype in _UNCHANGED_RECORDTYPES.values(): - _delete_all(_es8_recordtype) - _delete_all(es8_metrics.PublicItemUsageReportEs8) - _delete_all(es8_metrics.OsfCountedUsageRecord) - - -def _debug_migrate(each_new): - # TODO: remove this - for _each in each_new: - _each.full_clean() - pprint(_each.to_dict(include_meta=True)) - - def _es8_bulk_save(es8_recordtype, each_new_record): _success_count, _fail_count = es8_recordtype.bulk( each_new_record, @@ -203,24 +178,29 @@ def _date_range( (_from_date, _until_date) = (_until_date, _until_date + step) -def _es6_scan_all(es6_recordtype, query_body=None): - return es6_helpers.scan( - _es6_connection(), - index=es6_recordtype._template_pattern, - query=query_body, - ) - - -def _es6_scan_range(es6_recordtype, from_when: str, until_when: str, addl_filter=None): +def _es6_scan_range( + es6_recordtype, + *, + from_when: str = "", + until_when: str, + addl_filter=None, + sort=None, +): + _timestamp_range = {"lt": until_when} + if from_when: + _timestamp_range["gte"] = from_when _filters = [ - {"range": {"timestamp": {"gte": from_when, "lt": until_when}}}, + {"range": {"timestamp": _timestamp_range}}, ] if addl_filter: _filters.append(addl_filter) + _query_body = {"query": {"bool": {"filter": _filters}}} + if sort: + _query_body["sort"] = sort return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, - query={"query": {"bool": {"filter": _filters}}}, + query=_query_body, ) @@ -332,7 +312,9 @@ def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: timestamp=source["timestamp"], sessionhour_id=source["session_id"], platform_iri=source.get("platform_iri") or website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), source.get("item_type")), + database_iri=_convert_database_iri( + source.get("provider_id"), source.get("item_type") + ), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) @@ -345,7 +327,6 @@ def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: provider_id=source.get("provider_id"), user_is_authenticated=source.get("user_is_authenticated"), action_labels=source.get("action_labels"), - # TODO: does this need the PageviewInfo object or is the dictionary fine? pageview_info=source.get("pageview_info"), ) @@ -550,10 +531,10 @@ def _fallback_iri(): return _fallback_iri() -def _each_usage_report_osfid(started_at, after_osfid=None): +def _each_usage_report_osfid(until_when, after_osfid=None): _search = ( es6_reports.PublicItemUsageReport.search() - .filter("range", timestamp={"lt": started_at}) + .filter("range", timestamp={"lt": until_when}) .extra(size=0) ) _search.aggs.bucket( @@ -600,6 +581,10 @@ def add_arguments(self, parser): action="store_true", ) + @functools.cached_property + def _migration_started_at(self): + return es8_metrics.Elastic6To8State.get_started_at() + def handle( self, *, @@ -625,10 +610,13 @@ def handle( self._handle_usage_events(start=start, no_counts=no_counts) if usage_reports or _default_all: self._handle_usage_reports(start=start, no_counts=no_counts) + if not no_counts: + self.stdout.write("(counts may be approximate)") def _handle_unchanged(self, *, start: bool, no_counts: bool): # for each (unchanged) report/event: for _es6_cls, _es8_cls in _UNCHANGED_RECORDTYPES.items(): + _assert_field_unchangedness(_es6_cls, _es8_cls) if not no_counts: # display counts _es6_count = _es6_cls.search().count() @@ -641,26 +629,41 @@ def _handle_unchanged(self, *, start: bool, no_counts: bool): style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule task - self.stdout.write(f"starting {_es6_cls.__name__} => {_es8_cls.__name__}") - migrate_unchanged_recordtype.delay(_es6_cls.__name__) + self.stdout.write( + f"starting {_es6_cls.__name__} => {_es8_cls.__name__}" + ) + migrate_unchanged_recordtype.delay( + _es6_cls.__name__, self._migration_started_at.isoformat() + ) def _handle_usage_events(self, *, start: bool, no_counts: bool): # for counted-usage events: - _started = self._migration_started_at + _started = self._migration_started_at or datetime.datetime.now() _range_start = (_started - datetime.timedelta(days=_USAGE_DAYS_BACK)).date() _range_end = _started.date() + datetime.timedelta(days=1) if not no_counts: # display counts for each view/download event type - _range_q = {"range": {"timestamp": {"gte": _range_start.isoformat(), "lt": _range_end.isoformat()}}} + _range_q = { + "range": { + "timestamp": { + "gte": _range_start.isoformat(), + "lt": _range_end.isoformat(), + } + } + } _es6_pview_count = PreprintView.search().filter(_range_q).count() _es6_pdownload_count = PreprintDownload.search().filter(_range_q).count() _es6_usage_event_count = CountedUsageEs6.search().filter(_range_q).count() - _es6_count = _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + _es6_count = ( + _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count + ) _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() self._write_tabbed("es6", PreprintView, _es6_pview_count) self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) - self._write_tabbed("es6", f"(total between {_range_start} and {_range_end})", _es6_count) + self._write_tabbed( + "es6", f"(total between {_range_start} and {_range_end})", _es6_count + ) self._write_tabbed( "es8", es8_metrics.OsfCountedUsageRecord, @@ -668,7 +671,9 @@ def _handle_usage_events(self, *, start: bool, no_counts: bool): style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule (per-day?) tasks (if --start) - self.stdout.write(f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}") + self.stdout.write( + f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}" + ) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() _until_str = _until_date.isoformat() @@ -689,7 +694,10 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): style=self._eq_style(_es8_count, _es6_count), ) self._write_tabbed( - "es6", es6_reports.PublicItemUsageReport, "osfid count:", _es6_item_count + "es6", + es6_reports.PublicItemUsageReport, + "osfid count:", + _es6_item_count, ) self._write_tabbed( "es8", @@ -705,13 +713,11 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): f"starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}" ) for _osfid in _each_usage_report_osfid( - started_at=self._migration_started_at + until_when=self._migration_started_at ): - migrate_usage_reports.delay(_osfid) - - @functools.cached_property - def _migration_started_at(self): - return es8_metrics.Elastic6To8State.get_started_at() + migrate_usage_reports.delay( + _osfid, self._migration_started_at.isoformat() + ) def _check_started_at(self, start_now): _started_at = self._migration_started_at @@ -736,9 +742,6 @@ def _clear_state(self): ) es8_metrics.Elastic6To8State.search().query({"match_all": {}}).delete() es8_metrics.Elastic6To8State.refresh() - # TODO: REMOVE THIS - self.stdout.write("deleting all migration target data in es8", self.style.ERROR) - _delete_all_es8() def _eq_style(self, num: int, should_be: int): return self.style.SUCCESS if (num == should_be) else self.style.WARNING diff --git a/poetry.lock b/poetry.lock index 7aee4eca49f..1aec6afa426 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "fed3c14f213642284a197ac2933106cdafede25b" -resolved_reference = "fed3c14f213642284a197ac2933106cdafede25b" +reference = "34c7b180e6d595b3374534cd50efb00f5a809582" +resolved_reference = "34c7b180e6d595b3374534cd50efb00f5a809582" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "0f9c547a6309aa915b25f9a7a98e5d0c15c867d577a883547d894ca173cb2344" +content-hash = "9edb43576b960885c14e32e9ae74218c28d883df48679868848dbaa5780c4b12" diff --git a/pyproject.toml b/pyproject.toml index b04e0540d90..815efdd61a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "fed3c14f213642284a197ac2933106cdafede25b"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "34c7b180e6d595b3374534cd50efb00f5a809582"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From bac21a0ba0d2973d5da7fdc1718cd13b5724f15c Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 13:23:15 -0400 Subject: [PATCH 055/100] chore: "fix' quotes --- .../commands/migrate_osfmetrics_6to8.py | 338 +++++++++--------- 1 file changed, 169 insertions(+), 169 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 5ce383b99bc..ccc15834644 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -72,7 +72,7 @@ @celery_app.task(**_TASK_KWARGS) def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): - _es6_recordtype = djelme_registry.get_recordtype("osf", es6_recordtype_name) + _es6_recordtype = djelme_registry.get_recordtype('osf', es6_recordtype_name) _es8_recordtype = _UNCHANGED_RECORDTYPES[_es6_recordtype] _convert_kwargs = ( _convert_unchanged_cyclicrecord_kwargs @@ -80,7 +80,7 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): else (lambda _kw: _kw) # no conversion needed for event record ) _each_new = ( - _es8_recordtype(**_convert_kwargs(_hit["_source"])) + _es8_recordtype(**_convert_kwargs(_hit['_source'])) for _hit in _es6_scan_range(_es6_recordtype, until_when=until_when) ) return _es8_bulk_save(_es8_recordtype, _each_new) @@ -90,12 +90,12 @@ def migrate_unchanged_recordtype(es6_recordtype_name: str, until_when: str): def migrate_counted_usages(from_when: str, until_when: str): # CountedAuthUsage => OsfCountedUsageRecord _each_new = ( - _convert_counted_usage(_hit["_source"]) + _convert_counted_usage(_hit['_source']) for _hit in _es6_scan_range( CountedUsageEs6, from_when=from_when, until_when=until_when, - addl_filter={"exists": {"field": "item_guid"}}, + addl_filter={'exists': {'field': 'item_guid'}}, ) ) return _es8_bulk_save(es8_metrics.OsfCountedUsageRecord, _each_new) @@ -104,9 +104,9 @@ def migrate_counted_usages(from_when: str, until_when: str): @celery_app.task(**_TASK_KWARGS) def migrate_preprint_views(from_when: str, until_when: str): # PreprintView => OsfCountedUsageRecord - _action_labels = ["view", "web"] + _action_labels = ['view', 'web'] _each_new = ( - _convert_preprint_metric(_hit["_source"], _action_labels) + _convert_preprint_metric(_hit['_source'], _action_labels) for _hit in _es6_scan_range( PreprintView, from_when=from_when, until_when=until_when ) @@ -117,9 +117,9 @@ def migrate_preprint_views(from_when: str, until_when: str): @celery_app.task(**_TASK_KWARGS) def migrate_preprint_downloads(from_when: str, until_when: str): # PreprintDownload => OsfCountedUsageRecord - _action_labels = ["download"] + _action_labels = ['download'] _each_new = ( - _convert_preprint_metric(_hit["_source"], _action_labels) + _convert_preprint_metric(_hit['_source'], _action_labels) for _hit in _es6_scan_range( PreprintDownload, from_when=from_when, until_when=until_when ) @@ -136,14 +136,14 @@ def _each_new(): _each_hit = _es6_scan_range( es6_reports.PublicItemUsageReport, until_when=until_when, - addl_filter={"term": {"item_osfid": osfid}}, - sort="report_yearmonth", + addl_filter={'term': {'item_osfid': osfid}}, + sort='report_yearmonth', ) _prior_report = None for _hit in list(_each_hit): yield ( _prior_report := _convert_public_usage_report( - _hit["_source"], _prior_report + _hit['_source'], _prior_report ) ) @@ -155,7 +155,7 @@ def _each_new(): def _es6_connection(): - return es6_connections.get_connection("osfmetrics_es6") + return es6_connections.get_connection('osfmetrics_es6') def _es8_bulk_save(es8_recordtype, each_new_record): @@ -181,22 +181,22 @@ def _date_range( def _es6_scan_range( es6_recordtype, *, - from_when: str = "", + from_when: str = '', until_when: str, addl_filter=None, sort=None, ): - _timestamp_range = {"lt": until_when} + _timestamp_range = {'lt': until_when} if from_when: - _timestamp_range["gte"] = from_when + _timestamp_range['gte'] = from_when _filters = [ - {"range": {"timestamp": _timestamp_range}}, + {'range': {'timestamp': _timestamp_range}}, ] if addl_filter: _filters.append(addl_filter) - _query_body = {"query": {"bool": {"filter": _filters}}} + _query_body = {'query': {'bool': {'filter': _filters}}} if sort: - _query_body["sort"] = sort + _query_body['sort'] = sort return es6_helpers.scan( _es6_connection(), index=es6_recordtype._template_pattern, @@ -207,16 +207,16 @@ def _es6_scan_range( def _es6_usage_report_counts() -> tuple[int, int]: _search = es6_reports.PublicItemUsageReport.search() _search.aggs.metric( - "agg_item_count", - "cardinality", - field="item_osfid", + 'agg_item_count', + 'cardinality', + field='item_osfid', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total _item_count = ( _response.aggregations.agg_item_count.value - if "agg_item_count" in _response.aggregations + if 'agg_item_count' in _response.aggregations else 0 ) return (_total_count, _item_count) @@ -225,30 +225,30 @@ def _es6_usage_report_counts() -> tuple[int, int]: def _es8_usage_report_counts() -> tuple[int, int]: _search = es8_metrics.PublicItemUsageReportEs8.search() _search.aggs.metric( - "agg_item_count", - "cardinality", - field="item_osfid", + 'agg_item_count', + 'cardinality', + field='item_osfid', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _total_count = _response.hits.total.value _item_count = ( _response.aggregations.agg_item_count.value - if "agg_item_count" in _response.aggregations + if 'agg_item_count' in _response.aggregations else 0 ) return (_total_count, _item_count) def _get_es6_field_names(es6_recordtype): - """ + ''' adapted from DocumentBase._get_field_names in elasticsearch8.dsl - """ + ''' for _field_name in es6_recordtype._doc_type.mapping: _field = es6_recordtype._doc_type.mapping[_field_name] - if hasattr(_field, "_doc_class"): + if hasattr(_field, '_doc_class'): for _sub_field in _get_es6_field_names(_field._doc_class): - yield f"{_field_name}.{_sub_field}" + yield f'{_field_name}.{_sub_field}' else: yield _field_name @@ -260,20 +260,20 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): # remove fields intentionally removed in migration if issubclass(es6_recordtype, es6_reports.DailyReport): assert issubclass(es8_recordtype, djel8me.CyclicRecord) - _es6_fields.remove("timestamp") - _es6_fields.remove("report_date") + _es6_fields.remove('timestamp') + _es6_fields.remove('report_date') elif issubclass(es6_recordtype, es6_reports.MonthlyReport): assert issubclass(es8_recordtype, djel8me.CyclicRecord) - _es6_fields.remove("timestamp") - _es6_fields.remove("report_yearmonth") + _es6_fields.remove('timestamp') + _es6_fields.remove('report_yearmonth') else: assert issubclass(es8_recordtype, djel8me.EventRecord) # remove fields intentionally added in migration - _es8_fields.remove("timeseries_timeparts") + _es8_fields.remove('timeseries_timeparts') if issubclass(es8_recordtype, djel8me.CyclicRecord): - _es8_fields.remove("created") - _es8_fields.remove("cycle_coverage") + _es8_fields.remove('created') + _es8_fields.remove('cycle_coverage') # all remaining fields should match assert _es6_fields == _es8_fields @@ -281,24 +281,24 @@ def _assert_field_unchangedness(es6_recordtype, es8_recordtype): def _semverish_from_yearmonth(given_yearmonth: str): _ym = YearMonth.from_str(given_yearmonth) - return f"{_ym.year}.{_ym.month}" + return f'{_ym.year}.{_ym.month}' def _semverish_from_date(given_date: str): _d = datetime.date.fromisoformat(given_date) - return f"{_d.year}.{_d.month}.{_d.day}" + return f'{_d.year}.{_d.month}.{_d.day}' def _convert_unchanged_cyclicrecord_kwargs(es6_source: dict) -> dict: def _each_kwarg(): for _key, _val in es6_source.items(): - if _key == "report_yearmonth": + if _key == 'report_yearmonth': # report_yearmonth converts to cycle_coverage Y.M - yield ("cycle_coverage", _semverish_from_yearmonth(_val)) - elif _key == "report_date": + yield ('cycle_coverage', _semverish_from_yearmonth(_val)) + elif _key == 'report_date': # report_date converts to cycle_coverage Y.M.D - yield ("cycle_coverage", _semverish_from_date(_val)) - elif _key != "timestamp": + yield ('cycle_coverage', _semverish_from_date(_val)) + elif _key != 'timestamp': # skipping timestamp; on daily/monthly reports just copied from yearmonth/date yield (_key, _val) @@ -306,51 +306,51 @@ def _each_kwarg(): def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: - _item_iri = _iri_from_osfid(source["item_guid"]) + _item_iri = _iri_from_osfid(source['item_guid']) return es8_metrics.OsfCountedUsageRecord( # fields from djelme.CountedUsageRecord: - timestamp=source["timestamp"], - sessionhour_id=source["session_id"], - platform_iri=source.get("platform_iri") or website_settings.DOMAIN, + timestamp=source['timestamp'], + sessionhour_id=source['session_id'], + platform_iri=source.get('platform_iri') or website_settings.DOMAIN, database_iri=_convert_database_iri( - source.get("provider_id"), source.get("item_type") + source.get('provider_id'), source.get('item_type') ), item_iri=_item_iri, within_iris=[ _iri_from_osfid(_within_osfid) - for _within_osfid in source.get("surrounding_guids", ()) + for _within_osfid in source.get('surrounding_guids', ()) ], # fields from OsfCountedUsageRecord: - item_osfid=source["item_guid"], - item_type=source.get("item_type", "osf:Object"), - item_public=source.get("item_public"), - provider_id=source.get("provider_id"), - user_is_authenticated=source.get("user_is_authenticated"), - action_labels=source.get("action_labels"), - pageview_info=source.get("pageview_info"), + item_osfid=source['item_guid'], + item_type=source.get('item_type', 'osf:Object'), + item_public=source.get('item_public'), + provider_id=source.get('provider_id'), + user_is_authenticated=source.get('user_is_authenticated'), + action_labels=source.get('action_labels'), + pageview_info=source.get('pageview_info'), ) def _convert_preprint_metric( source: dict, action_labels: list[str] ) -> es8_metrics.OsfCountedUsageRecord: - _preprint_iri = _iri_from_osfid(source["preprint_id"]) + _preprint_iri = _iri_from_osfid(source['preprint_id']) return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: - timestamp=source["timestamp"], - user_id=source.get("user_id"), + timestamp=source['timestamp'], + user_id=source.get('user_id'), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get("provider_id"), "preprint"), + database_iri=_convert_database_iri(source.get('provider_id'), 'preprint'), item_iri=_preprint_iri, within_iris=[_preprint_iri], # fields from OsfCountedUsageRecord: - item_osfid=source["preprint_id"], - item_type="preprint", + item_osfid=source['preprint_id'], + item_type='preprint', item_public=True, - provider_id=source.get("provider_id"), - user_is_authenticated=bool(source.get("user_id")), + provider_id=source.get('provider_id'), + user_is_authenticated=bool(source.get('user_id')), action_labels=action_labels, ) @@ -361,40 +361,40 @@ def _convert_public_usage_report( ) -> es8_metrics.PublicItemUsageReportEs8: if prior_report is None: _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( - osfid=source["item_osfid"], - until_when=YearMonth.from_str(source["report_yearmonth"]).month_end(), - item_type=source.get("item_type"), + osfid=source['item_osfid'], + until_when=YearMonth.from_str(source['report_yearmonth']).month_end(), + item_type=source.get('item_type'), ) else: - _c_views = prior_report.cumulative_view_count + source.get("view_count", 0) + _c_views = prior_report.cumulative_view_count + source.get('view_count', 0) _c_view_sess = prior_report.cumulative_view_session_count + source.get( - "view_session_count", 0 + 'view_session_count', 0 ) _c_downloads = prior_report.cumulative_download_count + source.get( - "download_count", 0 + 'download_count', 0 ) _c_download_sess = prior_report.cumulative_download_session_count + source.get( - "download_session_count", 0 + 'download_session_count', 0 ) return es8_metrics.PublicItemUsageReportEs8( - cycle_coverage=_semverish_from_yearmonth(source["report_yearmonth"]), - item_osfid=source["item_osfid"], - item_type=source.get("item_type"), - provider_id=source.get("provider_id"), - platform_iri=source.get("platform_iri") or website_settings.DOMAIN, - view_count=source.get("view_count"), - view_session_count=source.get("view_session_count"), + cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), + item_osfid=source['item_osfid'], + item_type=source.get('item_type'), + provider_id=source.get('provider_id'), + platform_iri=source.get('platform_iri') or website_settings.DOMAIN, + view_count=source.get('view_count'), + view_session_count=source.get('view_session_count'), cumulative_view_count=_c_views, cumulative_view_session_count=_c_view_sess, - download_count=source.get("download_count"), - download_session_count=source.get("download_session_count"), + download_count=source.get('download_count'), + download_session_count=source.get('download_session_count'), cumulative_download_count=_c_downloads, cumulative_download_session_count=_c_download_sess, ) def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): - if item_type == "preprint": + if item_type == 'preprint': _views = _cumulative_preprint_count(PreprintView, osfid, until_when) _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when) _view_sess, _download_sess = 0, 0 # no session info on preprints (yet) @@ -407,90 +407,90 @@ def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): def _cumulative_countedusage_views(osfid: str, until_when: str) -> tuple[int, int]: - """compute view_session_count separately to avoid double-counting + '''compute view_session_count separately to avoid double-counting (the same session may be represented in both the composite agg on `item_guid` and that on `surrounding_guids`) - """ + ''' # copied/adapted from osf.metrics.reporters.public_item_usage _search = ( CountedUsageEs6.search() - .filter("term", item_public=True) - .filter("range", timestamp={"lt": until_when}) - .filter("term", action_labels="view") + .filter('term', item_public=True) + .filter('range', timestamp={'lt': until_when}) + .filter('term', action_labels='view') .filter( - "bool", + 'bool', should=[ - {"term": {"item_guid": osfid}}, - {"term": {"surrounding_guids": osfid}}, + {'term': {'item_guid': osfid}}, + {'term': {'surrounding_guids': osfid}}, ], minimum_should_match=1, ) .extra(size=0) # only aggregations, no hits ) _search.aggs.metric( - "agg_session_count", - "cardinality", - field="session_id", + 'agg_session_count', + 'cardinality', + field='session_id', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _view_count = _response.hits.total _view_session_count = ( _response.aggregations.agg_session_count.value - if "agg_session_count" in _response.aggregations + if 'agg_session_count' in _response.aggregations else 0 ) return (_view_count, _view_session_count) def _cumulative_countedusage_downloads(osfid, until_when) -> tuple[int, int]: - """aggregate downloads on each osfid (not including components/files)""" + '''aggregate downloads on each osfid (not including components/files)''' # copied/adapted from osf.metrics.reporters.public_item_usage _search = ( CountedUsageEs6.search() - .filter("term", item_public=True) - .filter("range", timestamp={"lt": until_when}) - .filter("term", action_labels="download") - .filter("term", item_guid=osfid) + .filter('term', item_public=True) + .filter('range', timestamp={'lt': until_when}) + .filter('term', action_labels='download') + .filter('term', item_guid=osfid) ) _search.aggs.metric( - "agg_session_count", - "cardinality", - field="session_id", + 'agg_session_count', + 'cardinality', + field='session_id', precision_threshold=_MAX_CARDINALITY_PRECISION, ) _response = _search.execute() _download_count = _response.hits.total _download_session_count = ( _response.aggregations.agg_session_count.value - if "agg_session_count" in _response.aggregations + if 'agg_session_count' in _response.aggregations else 0 ) return (_download_count, _download_session_count) def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) -> int: - """aggregate views on each preprint""" + '''aggregate views on each preprint''' # copied/adapted from osf.metrics.preprint_metrics _search = ( preprint_metric_cls.search() - .filter("term", preprint_id=osfid) - .filter("range", timestamp={"lt": until_when}) + .filter('term', preprint_id=osfid) + .filter('range', timestamp={'lt': until_when}) .extra(size=0) # no hits; only aggs ) - _search.aggs.metric("agg_count", "sum", field="count") + _search.aggs.metric('agg_count', 'sum', field='count') _response = _search.execute() _view_count = ( int(_response.aggregations.agg_count.value) - if hasattr(_response.aggregations, "agg_count") + if hasattr(_response.aggregations, 'agg_count') else 0 ) return _view_count def _iri_from_osfid(osfid: str) -> str: - return f"{website_settings.DOMAIN}{osfid}" + return f'{website_settings.DOMAIN}{osfid}' @functools.lru_cache @@ -499,34 +499,34 @@ def _convert_database_iri(provider_id: str | None, item_type: str) -> str: return website_settings.DOMAIN # osf is a provider, sure why not def _fallback_iri(): - return f"urn:osf.io:{provider_id}" + return f'urn:osf.io:{provider_id}' match item_type: # lower-cased osf.models class names - case "node" | "osfuser": - # implicit "osf" provider + case 'node' | 'osfuser': + # implicit 'osf' provider return website_settings.DOMAIN - case "preprint": + case 'preprint': try: _provider = osfdb.PreprintProvider.objects.get(_id=provider_id) except osfdb.PreprintProvider.DoesNotExist: - _logger.error(f"unknown preprint provider {provider_id!r}") + _logger.error(f'unknown preprint provider {provider_id!r}') return _fallback_iri() else: return _provider.get_semantic_iri() - case "registration": + case 'registration': try: _provider = osfdb.RegistrationProvider.objects.get(_id=provider_id) except osfdb.RegistrationProvider.DoesNotExist: - _logger.error(f"unknown registration provider {provider_id!r}") + _logger.error(f'unknown registration provider {provider_id!r}') return _fallback_iri() else: return _provider.get_semantic_iri() - case _ if "file" in item_type: + case _ if 'file' in item_type: # file providers are a different thing that don't really have an iri, just an id - return f"urn:files.osf.io:{provider_id}" + return f'urn:files.osf.io:{provider_id}' case _: # give up gracefully _logger.error( - f"unknown item type {item_type!r} with provider {provider_id!r}" + f'unknown item type {item_type!r} with provider {provider_id!r}' ) return _fallback_iri() @@ -534,16 +534,16 @@ def _fallback_iri(): def _each_usage_report_osfid(until_when, after_osfid=None): _search = ( es6_reports.PublicItemUsageReport.search() - .filter("range", timestamp={"lt": until_when}) + .filter('range', timestamp={'lt': until_when}) .extra(size=0) ) _search.aggs.bucket( - "agg_osfid", - "composite", - sources=[{"osfid": {"terms": {"field": "item_osfid"}}}], + 'agg_osfid', + 'composite', + sources=[{'osfid': {'terms': {'field': 'item_osfid'}}}], size=500, ) - return _iter_composite_bucket_keys(_search, "agg_osfid", "osfid", after=after_osfid) + return _iter_composite_bucket_keys(_search, 'agg_osfid', 'osfid', after=after_osfid) ### @@ -553,32 +553,32 @@ def _each_usage_report_osfid(until_when, after_osfid=None): class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument( - "--no-setup", - action="store_true", + '--no-setup', + action='store_true', ) parser.add_argument( - "--no-counts", - action="store_true", + '--no-counts', + action='store_true', ) parser.add_argument( - "--clear-state", - action="store_true", + '--clear-state', + action='store_true', ) parser.add_argument( - "--start", - action="store_true", + '--start', + action='store_true', ) parser.add_argument( - "--unchanged", - action="store_true", + '--unchanged', + action='store_true', ) parser.add_argument( - "--usage-events", - action="store_true", + '--usage-events', + action='store_true', ) parser.add_argument( - "--usage-reports", - action="store_true", + '--usage-reports', + action='store_true', ) @functools.cached_property @@ -599,7 +599,7 @@ def handle( ): self._quiet_chatty_loggers() if not no_setup: - call_command("djelme_backend_setup") + call_command('djelme_backend_setup') if clear_state: self._clear_state() self._check_started_at(start_now=start) @@ -611,7 +611,7 @@ def handle( if usage_reports or _default_all: self._handle_usage_reports(start=start, no_counts=no_counts) if not no_counts: - self.stdout.write("(counts may be approximate)") + self.stdout.write('(counts may be approximate)') def _handle_unchanged(self, *, start: bool, no_counts: bool): # for each (unchanged) report/event: @@ -621,16 +621,16 @@ def _handle_unchanged(self, *, start: bool, no_counts: bool): # display counts _es6_count = _es6_cls.search().count() _es8_count = _es8_cls.search().count() - self._write_tabbed("es6", _es6_cls, _es6_count) + self._write_tabbed('es6', _es6_cls, _es6_count) self._write_tabbed( - "es8", + 'es8', _es8_cls, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule task self.stdout.write( - f"starting {_es6_cls.__name__} => {_es8_cls.__name__}" + f'starting {_es6_cls.__name__} => {_es8_cls.__name__}' ) migrate_unchanged_recordtype.delay( _es6_cls.__name__, self._migration_started_at.isoformat() @@ -644,10 +644,10 @@ def _handle_usage_events(self, *, start: bool, no_counts: bool): if not no_counts: # display counts for each view/download event type _range_q = { - "range": { - "timestamp": { - "gte": _range_start.isoformat(), - "lt": _range_end.isoformat(), + 'range': { + 'timestamp': { + 'gte': _range_start.isoformat(), + 'lt': _range_end.isoformat(), } } } @@ -658,21 +658,21 @@ def _handle_usage_events(self, *, start: bool, no_counts: bool): _es6_pview_count + _es6_pdownload_count + _es6_usage_event_count ) _es8_count = es8_metrics.OsfCountedUsageRecord.search().count() - self._write_tabbed("es6", PreprintView, _es6_pview_count) - self._write_tabbed("es6", PreprintDownload, _es6_pdownload_count) - self._write_tabbed("es6", CountedUsageEs6, _es6_usage_event_count) + self._write_tabbed('es6', PreprintView, _es6_pview_count) + self._write_tabbed('es6', PreprintDownload, _es6_pdownload_count) + self._write_tabbed('es6', CountedUsageEs6, _es6_usage_event_count) self._write_tabbed( - "es6", f"(total between {_range_start} and {_range_end})", _es6_count + 'es6', f'(total between {_range_start} and {_range_end})', _es6_count ) self._write_tabbed( - "es8", + 'es8', es8_metrics.OsfCountedUsageRecord, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) if start: # schedule (per-day?) tasks (if --start) self.stdout.write( - f"starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}" + f'starting usages => {es8_metrics.OsfCountedUsageRecord.__name__}' ) for _from_date, _until_date in _date_range(_range_start, _range_end): _from_str = _from_date.isoformat() @@ -686,23 +686,23 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): # display counts of reports and distinct items _es6_count, _es6_item_count = _es6_usage_report_counts() _es8_count, _es8_item_count = _es8_usage_report_counts() - self._write_tabbed("es6", es6_reports.PublicItemUsageReport, _es6_count) + self._write_tabbed('es6', es6_reports.PublicItemUsageReport, _es6_count) self._write_tabbed( - "es8", + 'es8', es8_metrics.PublicItemUsageReportEs8, _es8_count, style=self._eq_style(_es8_count, _es6_count), ) self._write_tabbed( - "es6", + 'es6', es6_reports.PublicItemUsageReport, - "osfid count:", + 'osfid count:', _es6_item_count, ) self._write_tabbed( - "es8", + 'es8', es8_metrics.PublicItemUsageReportEs8, - "(items)", + '(items)', _es8_item_count, style=self._eq_style(_es8_item_count, _es6_item_count), ) @@ -710,7 +710,7 @@ def _handle_usage_reports(self, *, start: bool, no_counts: bool): # each item-task iter thru reports oldest to newest, adding cumulative counts if start: self.stdout.write( - f"starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}" + f'starting per-item {es6_reports.PublicItemUsageReport.__name__} => {es8_metrics.PublicItemUsageReportEs8.__name__}' ) for _osfid in _each_usage_report_osfid( until_when=self._migration_started_at @@ -723,24 +723,24 @@ def _check_started_at(self, start_now): _started_at = self._migration_started_at if _started_at: self.stdout.write( - f"osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}" + f'osf.metrics 6->8 migration started previously, at {_started_at.isoformat()}' ) elif start_now: _started_at = es8_metrics.Elastic6To8State.set_started_at_now() del self._migration_started_at # clear cache self.stdout.write( - f"osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}" + f'osf.metrics 6->8 migration starting now, at {_started_at.isoformat()}' ) else: self.stdout.write( - "osf.metrics 6->8 migration not started nor starting (run with `--start` to start)" + 'osf.metrics 6->8 migration not started nor starting (run with `--start` to start)' ) def _clear_state(self): self.stdout.write( - "clearing all migration state (start time, etc)", self.style.NOTICE + 'clearing all migration state (start time, etc)', self.style.NOTICE ) - es8_metrics.Elastic6To8State.search().query({"match_all": {}}).delete() + es8_metrics.Elastic6To8State.search().query({'match_all': {}}).delete() es8_metrics.Elastic6To8State.refresh() def _eq_style(self, num: int, should_be: int): @@ -752,13 +752,13 @@ def _to_str(strable): return strable.__name__ return str(strable) - self.stdout.write("\t".join(map(_to_str, strables)), style) + self.stdout.write('\t'.join(map(_to_str, strables)), style) def _quiet_chatty_loggers(self): _chatty_loggers = [ - "elasticsearch", - "elastic_transport", - "elasticsearch_metrics", + 'elasticsearch', + 'elastic_transport', + 'elasticsearch_metrics', ] for logger_name in _chatty_loggers: logging.getLogger(logger_name).setLevel(logging.ERROR) From 999dc869cd5bf0cd558f8cb2f0795e2a504e3427 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 13:43:29 -0400 Subject: [PATCH 056/100] fix: background migration task module --- website/settings/defaults.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 2d174472576..69f82d2d2a7 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -489,7 +489,7 @@ class CeleryConfig: } background_migration_modules = { - 'osf.management.commands.metrics_es8_migration', + 'osf.management.commands.migrate_osfmetrics_6to8', } try: From d9f5380aa7a1556a535b136e564f7c8e61d1fdc3 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 15:08:18 -0400 Subject: [PATCH 057/100] fix: timestamp tz handling --- osf/metrics/es8_metrics.py | 2 +- osf_tests/metrics/test_es8_metrics.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 2f4023105d8..3b83103b197 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -133,7 +133,7 @@ def _get_unique_together_values(self): self.timestamp.year, self.timestamp.month, self.timestamp.day, - tzinfo=datetime.UTC, + tzinfo=self.timestamp.tzinfo, ) time_in_seconds = (self.timestamp - day_start).total_seconds() time_window = int(time_in_seconds / 30) # 30-second windows diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index e93579628dc..e9dd140b60a 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -1,4 +1,4 @@ -from datetime import datetime +import datetime from elasticsearch_metrics.tests.util import djelme_test_backends import pytest @@ -20,7 +20,7 @@ def _real_elastic(self): def test_nested_pageview_autofill(self): usage = OsfCountedUsageRecord.record( - timestamp=datetime(2024, 1, 1, 15, 0), + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), sessionhour_id='blah', database_iri='https://osf.example/provider', item_iri='https://osf.example/itemm', From beb85485f6f06df8abdd98a703c3b31e139e0d98 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 15:27:03 -0400 Subject: [PATCH 058/100] fix: tests with djelme --- osf_tests/metrics/test_es8_metrics.py | 41 ++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index e9dd140b60a..ce562a026b4 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -39,13 +39,52 @@ def test_nested_pageview_autofill(self): assert usage.pageview_info.page_path == '/path/test' assert usage.pageview_info.referer_domain == 'google.com' assert usage.pageview_info.hour_of_day == 15 + assert usage.item_iri in usage.within_iris + + def test_nested_pageview_autofill_dict(self): + usage = OsfCountedUsageRecord.record( + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + pageview_info={ + 'page_url': 'https://example.com/path/test', + 'referer_url': 'https://google.com', + 'route_name': 'foo.bar', + 'page_title': 'title title', + }, + ) + assert usage.pageview_info.page_path == '/path/test' + assert usage.pageview_info.referer_domain == 'google.com' + assert usage.pageview_info.hour_of_day == 15 + assert usage.item_iri in usage.within_iris + + def test_none_pageview_nested_autofill(self): + usage = OsfCountedUsageRecord.record( + timestamp=datetime.datetime(2024, 1, 1, 15, 0, tzinfo=datetime.UTC), + sessionhour_id='blah', + database_iri='https://osf.example/provider', + item_iri='https://osf.example/itemm', + item_osfid='itemm', + item_public=True, + item_type='https://osf.example/Preprint', + platform_iri='https://osf.example', + user_is_authenticated=False, + ) + assert usage.pageview_info is None + assert usage.item_iri in usage.within_iris def test_save_report(self): _saved = DownloadCountReportEs8.record( cycle_coverage='2026.1.1', daily_file_downloads=17, ) - DownloadCountReportEs8.refresh_timeseries_indexes() + DownloadCountReportEs8.refresh() _response = DownloadCountReportEs8.search().execute() (_fetched,) = _response assert _fetched.meta.id == _saved.meta.id From 778f4b435627a08a7d5f475a578c6be0d37e5cb2 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 15:48:40 -0400 Subject: [PATCH 059/100] fix: pageview_info optional --- osf/metrics/es8_metrics.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 3b83103b197..4c46710748c 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -57,19 +57,19 @@ class PageviewInfo(esdsl.InnerDoc): """ # fields that should be provided - referer_url: str - page_url: str - page_title: str - route_name: str = esdsl.mapped_field(esdsl.Keyword( + referer_url: str | None + page_url: str | None + page_title: str | None + route_name: str | None = esdsl.mapped_field(esdsl.Keyword( fields={ 'by_prefix': esdsl.Text(analyzer=route_prefix_analyzer), }, )) # fields auto-filled - page_path: str - referer_domain: str - hour_of_day: int + page_path: str | None + referer_domain: str | None + hour_of_day: int | None ### @@ -111,7 +111,9 @@ def clean(self): if _ref_url: self.pageview_info.referer_domain = urlsplit(_ref_url).netloc # ensure inclusive "within" - if self.item_iri not in self.within_iris: + if not self.within_iris: + self.within_iris = [self.item_iri] + elif self.item_iri not in self.within_iris: self.within_iris = [self.item_iri, *self.within_iris] def _get_unique_together_values(self): From ee913841430543a12f89c16a1aba3e40bb1e280b Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 16:59:53 -0400 Subject: [PATCH 060/100] fix: tests --- osf/management/commands/migrate_osfmetrics_6to8.py | 2 ++ osf_tests/metrics/test_es8_metrics.py | 2 +- poetry.lock | 6 +++--- pyproject.toml | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index ccc15834644..04afa94b6b9 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -2,6 +2,7 @@ import datetime import functools import logging +import uuid from django.core.management import call_command from django.core.management.base import BaseCommand @@ -340,6 +341,7 @@ def _convert_preprint_metric( # fields used to compute a sessionhour_id: timestamp=source['timestamp'], user_id=source.get('user_id'), + client_session_id=str(uuid.uuid4()), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, database_iri=_convert_database_iri(source.get('provider_id'), 'preprint'), diff --git a/osf_tests/metrics/test_es8_metrics.py b/osf_tests/metrics/test_es8_metrics.py index ce562a026b4..a871054e96b 100644 --- a/osf_tests/metrics/test_es8_metrics.py +++ b/osf_tests/metrics/test_es8_metrics.py @@ -76,7 +76,7 @@ def test_none_pageview_nested_autofill(self): platform_iri='https://osf.example', user_is_authenticated=False, ) - assert usage.pageview_info is None + assert not usage.pageview_info assert usage.item_iri in usage.within_iris def test_save_report(self): diff --git a/poetry.lock b/poetry.lock index 1aec6afa426..4fcf24cabd1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "34c7b180e6d595b3374534cd50efb00f5a809582" -resolved_reference = "34c7b180e6d595b3374534cd50efb00f5a809582" +reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" +resolved_reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "9edb43576b960885c14e32e9ae74218c28d883df48679868848dbaa5780c4b12" +content-hash = "e510408fd1590e2ec46f022a6004e55df2c813f6e8688d0c6d75308f1dccf43b" diff --git a/pyproject.toml b/pyproject.toml index 815efdd61a6..ade2030afdd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "34c7b180e6d595b3374534cd50efb00f5a809582"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "222f03e92ec45a86f76db7a0461ae4fc483b2810"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From a65d6a580159eabd5cf6fdae8ab89b5d9ade5cfb Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 17:28:49 -0400 Subject: [PATCH 061/100] fix: preprint metric conversion --- osf/management/commands/migrate_osfmetrics_6to8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 04afa94b6b9..92b01e913c3 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -339,7 +339,7 @@ def _convert_preprint_metric( return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: - timestamp=source['timestamp'], + timestamp=datetime.datetime.fromisoformat(source['timestamp']), user_id=source.get('user_id'), client_session_id=str(uuid.uuid4()), # fields from djelme.CountedUsageRecord: From 2059a5657e8d60312da3b9a1a99d2fe129dfc5be Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Tue, 21 Apr 2026 17:28:58 -0400 Subject: [PATCH 062/100] fix: osf_shell --- osf/management/commands/osf_shell.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/osf/management/commands/osf_shell.py b/osf/management/commands/osf_shell.py index 851895623ac..69443d004be 100644 --- a/osf/management/commands/osf_shell.py +++ b/osf/management/commands/osf_shell.py @@ -32,7 +32,7 @@ def get_user_imports(): from django.db.models import Model from django_extensions.management.commands import shell_plus from django_extensions.management.utils import signalcommand -from elasticsearch_metrics.registry import registry as metrics_registry +from elasticsearch_metrics.registry import djelme_registry def header(text): @@ -160,7 +160,7 @@ def get_osf_imports(self): def get_metrics(self): return { each.__name__: each - for each in metrics_registry.get_metrics() + for each in djelme_registry.each_recordtype() } def get_grouped_imports(self, options): From c186373defd8b8bb732b1410fe320bb6c9553236 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Wed, 22 Apr 2026 15:58:00 -0400 Subject: [PATCH 063/100] per-deployment djelme index name prefix --- api/base/settings/defaults.py | 3 +++ poetry.lock | 6 +++--- pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/api/base/settings/defaults.py b/api/base/settings/defaults.py index 72e169c25a1..8f3683b6115 100644 --- a/api/base/settings/defaults.py +++ b/api/base/settings/defaults.py @@ -325,6 +325,7 @@ }, 'osfmetrics_es8': { 'elasticsearch_metrics.imps.elastic8': { + # passthru kwargs to elasticsearch8 connection constructor 'hosts': osf_settings.ELASTIC8_URI, 'ca_certs': osf_settings.ELASTIC8_CERT_PATH, 'basic_auth': ( @@ -332,6 +333,8 @@ if osf_settings.ELASTIC8_SECRET is not None else None ), + # djelme-specific kwargs + 'djelme_default_index_name_prefix': osf_settings.SHARE_PROVIDER_PREPEND, }, }, } diff --git a/poetry.lock b/poetry.lock index 4fcf24cabd1..c16b7d021e0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" -resolved_reference = "222f03e92ec45a86f76db7a0461ae4fc483b2810" +reference = "4e833670178beb682bb0d64e4f33db012cf8f014" +resolved_reference = "4e833670178beb682bb0d64e4f33db012cf8f014" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "e510408fd1590e2ec46f022a6004e55df2c813f6e8688d0c6d75308f1dccf43b" +content-hash = "d08b71fd886f9c6bd3d8d6cb1eda9f08431b7e84398b107e25f0371a4111266b" diff --git a/pyproject.toml b/pyproject.toml index ade2030afdd..fcc0decc86d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "222f03e92ec45a86f76db7a0461ae4fc483b2810"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "4e833670178beb682bb0d64e4f33db012cf8f014"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From e161f5d9e2480a53022df20823300378e3ff7b01 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Thu, 23 Apr 2026 13:45:12 -0400 Subject: [PATCH 064/100] better counted-usage autofill (and item_type iris) --- .../commands/migrate_osfmetrics_6to8.py | 93 ++++++++++--------- osf/metadata/osf_gathering.py | 63 ++----------- osf/metadata/osfmap_utils.py | 65 +++++++++++++ osf/metadata/serializers/linkset.py | 4 +- osf/metrics/es8_metrics.py | 87 +++++++++++++++-- 5 files changed, 206 insertions(+), 106 deletions(-) create mode 100644 osf/metadata/osfmap_utils.py diff --git a/osf/management/commands/migrate_osfmetrics_6to8.py b/osf/management/commands/migrate_osfmetrics_6to8.py index 92b01e913c3..b77d7b6af92 100644 --- a/osf/management/commands/migrate_osfmetrics_6to8.py +++ b/osf/management/commands/migrate_osfmetrics_6to8.py @@ -4,6 +4,7 @@ import logging import uuid +from django.apps import apps from django.core.management import call_command from django.core.management.base import BaseCommand from django.db import OperationalError as DjangoOperationalError @@ -16,6 +17,8 @@ from psycopg2 import OperationalError as PostgresOperationalError from framework.celery_tasks import app as celery_app +from osf.metadata.rdfutils import OSF +from osf.metadata.osfmap_utils import osfmap_type_from_model, osf_iri, is_osf_component from osf.metrics.preprint_metrics import ( PreprintView, PreprintDownload, @@ -131,6 +134,9 @@ def migrate_preprint_downloads(from_when: str, until_when: str): @celery_app.task(**_TASK_KWARGS) def migrate_usage_reports(osfid: str, until_when: str): # from PublicItemUsageReport to PublicItemUsageReportEs8 + _osfguid = osfdb.Guid.load(osfid) + _item_is_component = is_osf_component(_osfguid.referent) if _osfguid else False + def _each_new(): # go in sorted order to build cumulative counts # (only a few dozen of these per item; should be fine to sort and load all at once) @@ -144,7 +150,9 @@ def _each_new(): for _hit in list(_each_hit): yield ( _prior_report := _convert_public_usage_report( - _hit['_source'], _prior_report + _hit['_source'], + _prior_report, + item_is_component=_item_is_component, ) ) @@ -307,23 +315,25 @@ def _each_kwarg(): def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: - _item_iri = _iri_from_osfid(source['item_guid']) return es8_metrics.OsfCountedUsageRecord( # fields from djelme.CountedUsageRecord: timestamp=source['timestamp'], sessionhour_id=source['session_id'], platform_iri=source.get('platform_iri') or website_settings.DOMAIN, database_iri=_convert_database_iri( - source.get('provider_id'), source.get('item_type') + provider_id=source.get('provider_id'), + osf_model_name=source.get('item_type'), ), - item_iri=_item_iri, within_iris=[ - _iri_from_osfid(_within_osfid) + osf_iri(_within_osfid) for _within_osfid in source.get('surrounding_guids', ()) ], # fields from OsfCountedUsageRecord: item_osfid=source['item_guid'], - item_type=source.get('item_type', 'osf:Object'), + item_type=_convert_item_type( + source.get('item_type'), + has_surrounding_items=bool(source.get('surrounding_guids')), + ), item_public=source.get('item_public'), provider_id=source.get('provider_id'), user_is_authenticated=source.get('user_is_authenticated'), @@ -335,7 +345,6 @@ def _convert_counted_usage(source: dict) -> es8_metrics.OsfCountedUsageRecord: def _convert_preprint_metric( source: dict, action_labels: list[str] ) -> es8_metrics.OsfCountedUsageRecord: - _preprint_iri = _iri_from_osfid(source['preprint_id']) return es8_metrics.OsfCountedUsageRecord.record( using=False, # don't save yet; will save in bulk # fields used to compute a sessionhour_id: @@ -344,12 +353,13 @@ def _convert_preprint_metric( client_session_id=str(uuid.uuid4()), # fields from djelme.CountedUsageRecord: platform_iri=website_settings.DOMAIN, - database_iri=_convert_database_iri(source.get('provider_id'), 'preprint'), - item_iri=_preprint_iri, - within_iris=[_preprint_iri], + database_iri=_convert_database_iri( + provider_id=source.get('provider_id'), + osf_model_name='preprint', + ), # fields from OsfCountedUsageRecord: item_osfid=source['preprint_id'], - item_type='preprint', + item_type=OSF.Preprint, item_public=True, provider_id=source.get('provider_id'), user_is_authenticated=bool(source.get('user_id')), @@ -360,12 +370,13 @@ def _convert_preprint_metric( def _convert_public_usage_report( source: dict, prior_report: es8_metrics.PublicItemUsageReportEs8 | None, + item_is_component: bool, ) -> es8_metrics.PublicItemUsageReportEs8: if prior_report is None: _c_views, _c_view_sess, _c_downloads, _c_download_sess = _get_cumulative_usage( osfid=source['item_osfid'], until_when=YearMonth.from_str(source['report_yearmonth']).month_end(), - item_type=source.get('item_type'), + is_preprint=(source.get('item_type') == 'preprint'), ) else: _c_views = prior_report.cumulative_view_count + source.get('view_count', 0) @@ -381,7 +392,10 @@ def _convert_public_usage_report( return es8_metrics.PublicItemUsageReportEs8( cycle_coverage=_semverish_from_yearmonth(source['report_yearmonth']), item_osfid=source['item_osfid'], - item_type=source.get('item_type'), + item_type=_convert_item_type( + source.get('item_type'), + has_surrounding_items=item_is_component, + ), provider_id=source.get('provider_id'), platform_iri=source.get('platform_iri') or website_settings.DOMAIN, view_count=source.get('view_count'), @@ -395,8 +409,8 @@ def _convert_public_usage_report( ) -def _get_cumulative_usage(osfid: str, until_when, item_type: str | None): - if item_type == 'preprint': +def _get_cumulative_usage(osfid: str, until_when, *, is_preprint: bool): + if is_preprint: _views = _cumulative_preprint_count(PreprintView, osfid, until_when) _downloads = _cumulative_preprint_count(PreprintDownload, osfid, until_when) _view_sess, _download_sess = 0, 0 # no session info on preprints (yet) @@ -491,46 +505,37 @@ def _cumulative_preprint_count(preprint_metric_cls, osfid: str, until_when: str) return _view_count -def _iri_from_osfid(osfid: str) -> str: - return f'{website_settings.DOMAIN}{osfid}' +def _convert_item_type(osf_model_name: str | None, has_surrounding_items: bool): + if osf_model_name: + try: + return osfmap_type_from_model( + apps.get_model('osf', osf_model_name), + is_component=has_surrounding_items, + ) + except LookupError: + pass + return OSF.Object # fine, fallback to abstract type -@functools.lru_cache -def _convert_database_iri(provider_id: str | None, item_type: str) -> str: +def _convert_database_iri(provider_id: str | None, osf_model_name: str) -> str: if not provider_id: return website_settings.DOMAIN # osf is a provider, sure why not - def _fallback_iri(): - return f'urn:osf.io:{provider_id}' - - match item_type: # lower-cased osf.models class names - case 'node' | 'osfuser': - # implicit 'osf' provider + match osf_model_name: # lower-cased osf.models class names + case 'node' | 'osfuser': # implicit untyped 'osf' provider return website_settings.DOMAIN - case 'preprint': - try: - _provider = osfdb.PreprintProvider.objects.get(_id=provider_id) - except osfdb.PreprintProvider.DoesNotExist: - _logger.error(f'unknown preprint provider {provider_id!r}') - return _fallback_iri() - else: - return _provider.get_semantic_iri() - case 'registration': - try: - _provider = osfdb.RegistrationProvider.objects.get(_id=provider_id) - except osfdb.RegistrationProvider.DoesNotExist: - _logger.error(f'unknown registration provider {provider_id!r}') - return _fallback_iri() - else: - return _provider.get_semantic_iri() - case _ if 'file' in item_type: + case 'preprint': # match PreprintProvider.get_semantic_iri + return f'{website_settings.DOMAIN}preprints/{provider_id}' + case 'registration': # match RegistrationProvider.get_semantic_iri + return f'{website_settings.DOMAIN}registries/{provider_id}' + case _ if 'file' in osf_model_name: # file providers are a different thing that don't really have an iri, just an id return f'urn:files.osf.io:{provider_id}' case _: # give up gracefully _logger.error( - f'unknown item type {item_type!r} with provider {provider_id!r}' + f'unknown model {osf_model_name!r} with provider {provider_id!r}' ) - return _fallback_iri() + return f'urn:osf.io:{provider_id}' def _each_usage_report_osfid(until_when, after_osfid=None): diff --git a/osf/metadata/osf_gathering.py b/osf/metadata/osf_gathering.py index dfa74612bd0..118151627d3 100644 --- a/osf/metadata/osf_gathering.py +++ b/osf/metadata/osf_gathering.py @@ -13,6 +13,12 @@ from osf import models as osfdb from osf.metadata import gather from osf.metadata.definitions.datacite import DATACITE_RESOURCE_TYPES_GENERAL +from osf.metadata.osfmap_utils import ( + osfmap_type, + osf_iri, + is_osf_component, + osfid_from_iri, +) from osf.metadata.rdfutils import ( DATACITE, DCAT, @@ -30,7 +36,6 @@ SKOS, checksum_iri, format_dcterms_extent, - without_namespace, smells_like_iri, ) from osf.metrics.reports import PublicItemUsageReport @@ -319,15 +324,13 @@ def get_expiration_date(self, basket: gather.Basket) -> datetime.date | None: ##### END osfmap ##### -##### BEGIN osf-specific utils ##### - class OsfFocus(gather.Focus): def __init__(self, osf_item): if isinstance(osf_item, str): osf_item = osfdb.base.coerce_guid(osf_item).referent super().__init__( iri=osf_iri(osf_item), - rdftype=get_rdf_type(osf_item), + rdftype=osfmap_type(osf_item), provider_id=osf_item.provider._id if (osf_item and getattr(osf_item, 'type', '') == 'osf.registration' and osf_item.provider) else None ) self.dbmodel = osf_item @@ -337,54 +340,6 @@ def __init__(self, osf_item): pass # is ok for a focus to be something non-osfguidy -def is_root(osf_node): - return (osf_node.root_id == osf_node.id) - - -def get_rdf_type(osfguid_referent): - if isinstance(osfguid_referent, osfdb.Guid): - osfguid_referent = osfguid_referent.referent - - if isinstance(osfguid_referent, osfdb.OSFUser): - return DCTERMS.Agent - if isinstance(osfguid_referent, osfdb.BaseFileNode): - return OSF.File - if isinstance(osfguid_referent, osfdb.Preprint): - return OSF.Preprint - if isinstance(osfguid_referent, osfdb.Registration): - return ( - OSF.Registration - if is_root(osfguid_referent) - else OSF.RegistrationComponent - ) - if isinstance(osfguid_referent, osfdb.Node): - return ( - OSF.Project - if is_root(osfguid_referent) - else OSF.ProjectComponent - ) - raise NotImplementedError - - -def osf_iri(guid_or_model): - """return a rdflib.URIRef or None - - @param guid_or_model: a string, Guid instance, or another osf model instance - @returns rdflib.URIRef or None - """ - guid = osfdb.base.coerce_guid(guid_or_model) - return OSFIO[guid._id] - - -def osfguid_from_iri(iri: str) -> str: - if iri.startswith(OSFIO): - return without_namespace(iri, OSFIO) - raise ValueError(f'expected iri starting with "{OSFIO}" (got "{iri}")') - - -##### END osf-specific utils ##### - - ##### BEGIN the gatherers ##### # @@ -718,7 +673,7 @@ def gather_file_mediatype(focus): @gather.er(DCTERMS.hasPart, DCTERMS.isPartOf) def gather_parts(focus): if isinstance(focus.dbmodel, osfdb.AbstractNode): - if not is_root(focus.dbmodel) and focus.dbmodel.root.is_public: + if is_osf_component(focus.dbmodel) and focus.dbmodel.root.is_public: root_focus = OsfFocus(focus.dbmodel.root) yield (OSF.hasRoot, root_focus) child_relations = ( @@ -1130,7 +1085,7 @@ def gather_cedar_templates(focus): @gather.er(OSF.usage) def gather_last_month_usage(focus): _usage_report = PublicItemUsageReport.for_last_month( - item_osfid=osfguid_from_iri(focus.iri), + item_osfid=osfid_from_iri(focus.iri), ) if _usage_report is not None: _usage_report_ref = rdflib.BNode() diff --git a/osf/metadata/osfmap_utils.py b/osf/metadata/osfmap_utils.py new file mode 100644 index 00000000000..e3e9ab89a9c --- /dev/null +++ b/osf/metadata/osfmap_utils.py @@ -0,0 +1,65 @@ +from osf.metadata.rdfutils import ( + DCTERMS, + OSF, + OSFIO, + without_namespace, +) +from osf import models as osfdb + + +def is_osf_component(osf_node) -> bool: + return ( + isinstance(osf_node, osfdb.AbstractNode) + and osf_node.root_id != osf_node.id + ) + + +def osfmap_type_from_model(model_cls, *, is_component=None): + if issubclass(model_cls, osfdb.OSFUser): + return DCTERMS.Agent + if issubclass(model_cls, osfdb.BaseFileNode): + return OSF.File + if issubclass(model_cls, osfdb.Preprint): + return OSF.Preprint + if issubclass(model_cls, osfdb.Registration): + if is_component is None: + raise ValueError(f'osfmap_type_from_model requires `is_component` for {model_cls}') + return ( + OSF.RegistrationComponent + if is_component + else OSF.Registration + ) + if issubclass(model_cls, osfdb.Node): + if is_component is None: + raise ValueError(f'osfmap_type_from_model requires `is_component` for {model_cls}') + return ( + OSF.ProjectComponent + if is_component + else OSF.Project + ) + raise LookupError(model_cls) + + +def osfmap_type(osf_obj): + if isinstance(osf_obj, osfdb.Guid): + osf_obj = osf_obj.referent + return osfmap_type_from_model(type(osf_obj), is_component=is_osf_component(osf_obj)) + + +def osf_iri(guid_or_model): + """return a rdflib.URIRef or None + + @param guid_or_model: a string, Guid instance, or another osf model instance + @returns rdflib.URIRef or None + """ + guid = osfdb.base.coerce_guid(guid_or_model) + return OSFIO[guid._id] + + +def osfid_from_iri(iri: str) -> str: + if not iri.startswith(OSFIO): + raise ValueError(f'expected iri starting with "{OSFIO}" (got {iri!r})') + _osfid = without_namespace(iri, OSFIO) + if not _osfid or '/' in _osfid: + raise ValueError(f'expected iri path with exactly one segment (got {_osfid!r} from {iri!r})') + return _osfid diff --git a/osf/metadata/serializers/linkset.py b/osf/metadata/serializers/linkset.py index f83dad00ebd..3ee907d0532 100644 --- a/osf/metadata/serializers/linkset.py +++ b/osf/metadata/serializers/linkset.py @@ -16,7 +16,7 @@ import rdflib from ._base import MetadataSerializer -from osf.metadata.osf_gathering import osfguid_from_iri +from osf.metadata.osf_gathering import osfid_from_iri from osf.metadata.rdfutils import (DOI, DATACITE, DCTERMS, OWL, RDF, OSF, DCAT, SCHEMA, DATACITE_SCHEMA_RESOURCE_TYPE_GENERAL_MAPPING, map_resource_type_general_datacite_to_scheme) from website.settings import DOMAIN from website.util import web_url_for @@ -74,7 +74,7 @@ def _each_link(self) -> Iterator[SignpostLink]: base_metadata_url = urljoin(DOMAIN, web_url_for( 'metadata_download', # name of a view function mapped in website/routes.py - guid=osfguid_from_iri(self.basket.focus.iri), + guid=osfid_from_iri(self.basket.focus.iri), )) split_base_metadata_url = urlsplit(base_metadata_url) diff --git a/osf/metrics/es8_metrics.py b/osf/metrics/es8_metrics.py index 4c46710748c..3bc573865c3 100644 --- a/osf/metrics/es8_metrics.py +++ b/osf/metrics/es8_metrics.py @@ -1,12 +1,21 @@ import datetime import enum +import functools from urllib.parse import urlsplit import elasticsearch8.dsl as esdsl from elasticsearch_metrics import DAILY, MONTHLY, YEARLY import elasticsearch_metrics.imps.elastic8 as djelme +from osf.metadata.osfmap_utils import ( + osfmap_type, + osf_iri, + osfid_from_iri, +) +from osf.metrics.counted_usage import _get_surrounding_guids from osf.metrics.utils import YearMonth +from osf import models as osfdb +from website import settings as website_settings ### @@ -99,9 +108,69 @@ class OsfCountedUsageRecord(djelme.CountedUsageRecord): action_labels: list[str] pageview_info: PageviewInfo | None + @functools.cached_property + def _osfid_referent(self): + # for use by autofill methods, if needed + return osfdb.Guid.load(self.item_osfid) + def clean(self): super().clean() - # autofill pageview_info fields + self._autofill_item_iri_and_osfid() + self._autofill_item_public() + self._autofill_item_type() + self._autofill_provider_id() + self._autofill_within_iris() + self._autofill_pageview() + self._autofill_database_iri() + + def _autofill_item_iri_and_osfid(self): + if self.item_osfid and not self.item_iri: + self.item_iri = osf_iri(self.item_osfid) + elif self.item_iri and not self.item_osfid: + try: + self.item_osfid = osfid_from_iri(self.item_iri) + except ValueError: + pass + + def _autofill_item_public(self): + if self.item_osfid and (self.item_public is None): + _item = self._osfid_referent + # if it quacks like BaseFileNode, look at .target instead + _item = getattr(_item, 'target', None) or _item + self.item_public = ( + _item.verified_publishable # quacks like Preprint + if hasattr(_item, 'verified_publishable') + else getattr(_item, 'is_public', False) # quacks like AbstractNode + ) + + def _autofill_item_type(self): + if self.item_osfid and not self.item_type: + self.item_type = osfmap_type(self._osfid_referent) + + def _autofill_provider_id(self): + if self.item_osfid and not self.provider_id: + _provider = getattr(self._osfid_referent, 'provider', None) + if _provider is None: + self.provider_id = 'osf' # quacks like Node, Comment, WikiPage + elif isinstance(_provider, str): + self.provider_id = _provider # quacks like BaseFileNode + else: + self.provider_id = _provider._id # quacks like Registration, Preprint, Collection + + def _autofill_within_iris(self): + if self.item_osfid and (self.within_iris is None) and self._osfid_referent: + self.within_iris = [ + osf_iri(_osfid) + for _osfid in _get_surrounding_guids(self._osfid_referent) + ] + # ensure inclusive "within" + if not self.within_iris: + self.within_iris = [self.item_iri] + if self.item_iri not in self.within_iris: + self.within_iris = [self.item_iri, *self.within_iris] + + def _autofill_pageview(self): + # autofill pageview_info fields from other fields if self.pageview_info: self.pageview_info.hour_of_day = self.timestamp.hour _url = self.pageview_info.page_url @@ -110,11 +179,17 @@ def clean(self): _ref_url = self.pageview_info.referer_url if _ref_url: self.pageview_info.referer_domain = urlsplit(_ref_url).netloc - # ensure inclusive "within" - if not self.within_iris: - self.within_iris = [self.item_iri] - elif self.item_iri not in self.within_iris: - self.within_iris = [self.item_iri, *self.within_iris] + + def _autofill_database_iri(self): + if self.item_osfid and not self.database_iri: + _provider = getattr(self._osfid_referent, 'provider', None) + if not _provider: + self.database_iri = website_settings.DOMAIN + elif isinstance(_provider, str): + # file providers are a different thing that don't really have an iri, just an id + self.database_iri = f'urn:files.osf.io:{self.provider_id}' + else: + self.database_iri = _provider.get_semantic_iri() def _get_unique_together_values(self): """get "unique together" values for "ON CONFLICT UPDATE" behavior From 45d1e30ccaed64b7a274c0d6748af65feb088e1a Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 24 Apr 2026 09:32:18 -0400 Subject: [PATCH 065/100] osf-admin migrate_osfmetrics_6to8 --- admin/management/urls.py | 5 +++-- admin/management/views.py | 20 ++++++++++++++++++++ admin/templates/management/commands.html | 23 +++++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/admin/management/urls.py b/admin/management/urls.py index d583deb2ce0..79c5be0a7a2 100644 --- a/admin/management/urls.py +++ b/admin/management/urls.py @@ -1,4 +1,4 @@ -from django.urls import re_path +from django.urls import re_path, path from admin.management import views @@ -19,5 +19,6 @@ re_path(r'^empty_metadata_dataarchive_registration_bulk_resync', views.EmptyMetadataDataarchiveRegistrationBulkResync.as_view(), name='empty-metadata-dataarchive-registration-bulk-resync'), re_path(r'^sync_notification_templates', views.SyncNotificationTemplates.as_view(), - name='sync_notification_templates') + name='sync_notification_templates'), + path('migrate_osfmetrics_6to8', views.MigrateOsfmetrics6to8.as_view(), name='migrate_osfmetrics_6to8'), ] diff --git a/admin/management/views.py b/admin/management/views.py index 36f3d893f24..c390d08e629 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -1,9 +1,12 @@ +from io import StringIO + from dateutil.parser import isoparse from django.views.generic import TemplateView, View from django.contrib import messages from django.http import HttpResponse from django.utils import timezone from django.contrib.auth.mixins import PermissionRequiredMixin +from django.core.management import call_command, CommandError from osf.management.commands.manage_switch_flags import manage_waffle from osf.management.commands.update_registration_schemas import update_registration_schemas @@ -181,3 +184,20 @@ def post(self, request): populate_notification_types() messages.success(request, 'Notification templates have been successfully synced.') return redirect(reverse('management:commands')) + + +class MigrateOsfmetrics6to8(ManagementCommandPermissionView): + def post(self, request): + _command_kwargs = { + 'no_setup': True, + 'no_counts': request.POST.get('no_counts'), + 'clear_state': request.POST.get('clear_state'), + 'start': request.POST.get('start'), + 'unchanged': request.POST.get('unchanged'), + 'usage_reports': request.POST.get('usage_reports'), + 'usage_events': request.POST.get('usage_events'), + } + _out_io = StringIO() + call_command('migrate_osfmetrics_6to8', **_command_kwargs, stdout=_out_io) + messages.info(request, _out_io.getvalue()) + return redirect(reverse('management:commands')) diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html index dd90affd5ff..ae4ea406b00 100644 --- a/admin/templates/management/commands.html +++ b/admin/templates/management/commands.html @@ -165,6 +165,29 @@

Sync Notification Templates

+
+

migrate osf-metrics 6to8

+

+ view progress of the osf-metrics migration from elastic6 to elastic8 (or start it) +

+
+ {% csrf_token %} + + + +
+ default all if unselected: + + + +
+ +
+
{% endblock %} From 2537561f7d682c5273ef7bb33df864cc93056db2 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 24 Apr 2026 08:44:17 -0400 Subject: [PATCH 066/100] /_/metrics/raw-es8_metrics/... --- api/metrics/urls.py | 4 +++- api/metrics/views.py | 32 +++++++++++++++++---------- api_tests/metrics/test_raw_metrics.py | 12 +++++----- poetry.lock | 6 ++--- pyproject.toml | 2 +- 5 files changed, 34 insertions(+), 22 deletions(-) diff --git a/api/metrics/urls.py b/api/metrics/urls.py index e135212541c..db63df3dd4c 100644 --- a/api/metrics/urls.py +++ b/api/metrics/urls.py @@ -5,7 +5,9 @@ app_name = 'osf' urlpatterns = [ - re_path(r'^raw/(?P[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name), + re_path(r'^raw/(?P[a-z0-9._/]*)$', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'djelme_backend_name': 'osfmetrics_es6'}), + path('raw-/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name, kwargs={'url_path': ''}), + path('raw-/', views.RawMetricsView.as_view(), name=views.RawMetricsView.view_name), re_path(r'^preprints/views/$', views.PreprintViewMetrics.as_view(), name=views.PreprintViewMetrics.view_name), re_path(r'^preprints/downloads/$', views.PreprintDownloadMetrics.as_view(), name=views.PreprintDownloadMetrics.view_name), re_path(r'^registries_moderation/transitions/$', views.RegistriesModerationMetricsView.as_view(), name=views.RegistriesModerationMetricsView.view_name), diff --git a/api/metrics/views.py b/api/metrics/views.py index c6e4d56c9b9..69c44027ec9 100644 --- a/api/metrics/views.py +++ b/api/metrics/views.py @@ -8,6 +8,7 @@ from elasticsearch6.exceptions import NotFoundError, RequestError from elasticsearch6_dsl.connections import get_connection +from elasticsearch_metrics.registry import djelme_registry from framework.auth.oauth_scopes import CoreScopes @@ -225,24 +226,31 @@ def delete(self, request, *args, **kwargs): raise ValidationError('DELETE not supported. Use GET/POST/PUT') @require_switch(ENABLE_RAW_METRICS) - def get(self, request, *args, **kwargs): - connection = get_connection() - url_path = kwargs['url_path'] - return JsonResponse(connection.transport.perform_request('GET', f'/{url_path}')) + def get(self, request, *args, djelme_backend_name, url_path, **kwargs): + connection = self._get_es_connection(djelme_backend_name) + _response = connection.transport.perform_request('GET', f'/{url_path}') + return JsonResponse(_response if isinstance(_response, dict) else _response.body) @require_switch(ENABLE_RAW_METRICS) - def post(self, request, *args, **kwargs): - connection = get_connection() - url_path = kwargs['url_path'] + def post(self, request, *args, djelme_backend_name, url_path, **kwargs): + connection = self._get_es_connection(djelme_backend_name) body = json.loads(request.body) - return JsonResponse(connection.transport.perform_request('POST', f'/{url_path}', body=body)) + _response = connection.transport.perform_request('POST', f'/{url_path}', body=body) + return JsonResponse(_response if isinstance(_response, dict) else _response.body) @require_switch(ENABLE_RAW_METRICS) - def put(self, request, *args, **kwargs): - connection = get_connection() - url_path = kwargs['url_path'] + def put(self, request, *args, djelme_backend_name, url_path, **kwargs): + connection = self._get_es_connection(djelme_backend_name) body = json.loads(request.body) - return JsonResponse(connection.transport.perform_request('PUT', f'/{url_path}', body=body)) + _response = connection.transport.perform_request('PUT', f'/{url_path}', body=body) + return JsonResponse(_response if isinstance(_response, dict) else _response.body) + + def _get_es_connection(self, djelme_backend_name): + try: + _backend = djelme_registry.get_backend(djelme_backend_name) + except LookupError: + raise Http404 + return _backend.elastic_client class RegistriesModerationMetricsView(GenericAPIView): diff --git a/api_tests/metrics/test_raw_metrics.py b/api_tests/metrics/test_raw_metrics.py index 6a3b9b8f8c5..e32936d9024 100644 --- a/api_tests/metrics/test_raw_metrics.py +++ b/api_tests/metrics/test_raw_metrics.py @@ -1,10 +1,10 @@ import pytest -import time from website.app import setup_django setup_django() from waffle.testutils import override_switch +from elasticsearch6_dsl.connections import connections as es6_connections from osf import features from osf_tests.factories import AuthUserFactory @@ -40,9 +40,9 @@ def user(self): def other_user(self): return AuthUserFactory() - @pytest.fixture - def base_url(self): - return f'/{API_BASE}metrics/raw/' + @pytest.fixture(params=['raw', 'raw-osfmetrics_es6']) + def base_url(self, request): + return f'/{API_BASE}metrics/{request.param}/' def test_delete(self, app, user, base_url): res = app.delete_json_api(base_url, auth=user.auth, expect_errors=True) @@ -136,7 +136,9 @@ def test_post_and_get(self, app, user, base_url): res = app.post_json_api(post_url, post_data, auth=user.auth) assert res.json == post_return - time.sleep(3) + es6_connections.get_connection('osfmetrics_es6').indices.refresh( + index='customer', + ) get_url = f'{base_url}customer/_search?q=*' res = app.get(get_url, auth=user.auth) diff --git a/poetry.lock b/poetry.lock index c16b7d021e0..37f0a9a8292 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1101,8 +1101,8 @@ elastic8 = ["elasticsearch8 (>=8.0.0,<9.0.0)"] [package.source] type = "git" url = "https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git" -reference = "4e833670178beb682bb0d64e4f33db012cf8f014" -resolved_reference = "4e833670178beb682bb0d64e4f33db012cf8f014" +reference = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2" +resolved_reference = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2" [[package]] name = "django-extensions" @@ -4711,4 +4711,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.1" python-versions = "^3.12" -content-hash = "d08b71fd886f9c6bd3d8d6cb1eda9f08431b7e84398b107e25f0371a4111266b" +content-hash = "fe2cf66c0cc6f72e6d6191ac07a2e1ca874324afc19e2d073a51ce69422e75e5" diff --git a/pyproject.toml b/pyproject.toml index fcc0decc86d..9cdd094dde1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,7 +91,7 @@ datacite = "1.1.3" rdflib = "7.0.0" colorlog = "6.8.2" # Metrics -django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "4e833670178beb682bb0d64e4f33db012cf8f014"} +django-elasticsearch-metrics = {git ="https://github.com/CenterForOpenScience/django-elasticsearch-metrics.git", rev = "f2b92e5509389bb6c33f5a90c9ca4fe4e68187e2"} # Impact Metrics CSV Export djangorestframework-csv = "3.0.2" gevent = "24.2.1" From 4084a367f2cd8e2fa88d0ed46caab2556fc776e0 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 24 Apr 2026 12:38:21 -0400 Subject: [PATCH 067/100] better 6to8 error handling --- admin/management/views.py | 6 +++-- admin/templates/management/commands.html | 5 ++-- .../commands/migrate_osfmetrics_6to8.py | 25 ++++++++++--------- osf/metadata/osfmap_utils.py | 8 ++++-- 4 files changed, 26 insertions(+), 18 deletions(-) diff --git a/admin/management/views.py b/admin/management/views.py index c390d08e629..cdde3dfa7a3 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -6,7 +6,7 @@ from django.http import HttpResponse from django.utils import timezone from django.contrib.auth.mixins import PermissionRequiredMixin -from django.core.management import call_command, CommandError +from django.core.management import call_command from osf.management.commands.manage_switch_flags import manage_waffle from osf.management.commands.update_registration_schemas import update_registration_schemas @@ -190,6 +190,7 @@ class MigrateOsfmetrics6to8(ManagementCommandPermissionView): def post(self, request): _command_kwargs = { 'no_setup': True, + 'no_color': True, 'no_counts': request.POST.get('no_counts'), 'clear_state': request.POST.get('clear_state'), 'start': request.POST.get('start'), @@ -199,5 +200,6 @@ def post(self, request): } _out_io = StringIO() call_command('migrate_osfmetrics_6to8', **_command_kwargs, stdout=_out_io) - messages.info(request, _out_io.getvalue()) + for _line in _out_io.getvalue().split('\n'): + messages.info(request, _line) return redirect(reverse('management:commands')) diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html index ae4ea406b00..ceed7ac77e1 100644 --- a/admin/templates/management/commands.html +++ b/admin/templates/management/commands.html @@ -174,14 +174,15 @@

migrate osf-metrics 6to8

action="{% url 'management:migrate_osfmetrics_6to8'%}" style="display: flex; flex-direction: column;"> {% csrf_token %} + -
- default all if unselected: + (narrow types: + )