Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
84fd32f
Fix validation blocking table swap when production has duplicate pmids
paulalbert1 Mar 5, 2026
275b607
Merge pull request #74 from wcmc-its/docs/cleanup-and-agent-guide-dev
paulalbert1 Mar 5, 2026
12231dd
Merge pull request #72 from wcmc-its/fix/nih-duplicate-validation-byp…
paulalbert1 Mar 5, 2026
4605fc2
Updated journal impact score inference
paulalbert1 Mar 27, 2023
413043c
fix(percentile): replace broken percentile/rank/denominator logic in …
paulalbert1 Mar 26, 2026
6ad492e
feat(schema): add 4 new Feature Generator fields
paulalbert1 Apr 14, 2026
9bd507c
fix(migration): drop AFTER clauses to enable ALGORITHM=INSTANT
paulalbert1 Apr 14, 2026
1ad2244
fix(SP): fall back to publicationDateStandardized when articleYear is 0
paulalbert1 May 6, 2026
9c215fd
fix(retrieveNIH): correct column inversion in analysis_nih_cites load
paulalbert1 May 15, 2026
eae7c78
Merge pull request #76 from wcmc-its/fix/nih-cites-column-inversion-dev
paulalbert1 May 15, 2026
810bc17
fix(abstractImport): replace CSV bulk-load that silently dropped rows
paulalbert1 May 16, 2026
4520e69
Merge pull request #80 from wcmc-its/fix/abstract-import-loop-dev
paulalbert1 May 16, 2026
a97b535
feat(reporter): NIH RePORTER ETL with provenance reconciliation
paulalbert1 May 9, 2026
3d88b92
feat(reporter): capture NIH RePORTER project terms (#291)
paulalbert1 May 18, 2026
7cbf2e6
Merge pull request #82 from wcmc-its/feat/reporter-etl-dev
paulalbert1 May 18, 2026
001c1bd
fix(reporter): dedup projects by appl_id before grant_reporter_projec…
paulalbert1 May 18, 2026
813662c
Merge pull request #84 from wcmc-its/fix/reporter-projects-dedup-dev
paulalbert1 May 18, 2026
2ada77f
fix(docker): add missing COPY for retrieveReporter.py
paulalbert1 May 19, 2026
12ce6fd
Merge pull request #86 from wcmc-its/fix/dockerfile-retrievereporter-dev
paulalbert1 May 19, 2026
87d9443
fix(reporting_abstracts): repair cross-paper concatenation from old C…
paulalbert1 May 20, 2026
8e82e7a
fix: repair script handles generated columns; migration guard actuall…
paulalbert1 May 20, 2026
8ff6cda
Merge pull request #88 from wcmc-its/fix/abstract-corruption-cleanup-dev
paulalbert1 May 20, 2026
b3eaf6d
feat(setup): add durable authorship_review table
paulalbert1 Jun 8, 2026
9cd385b
Merge pull request #91 from wcmc-its/feature/authorship-review-table
paulalbert1 Jun 8, 2026
b6f97dc
feat(setup): add admin_users scope/proxy column migration (v1.5)
paulalbert1 Jun 9, 2026
ffd0631
Merge pull request #93 from wcmc-its/feature/admin-user-scope-columns
paulalbert1 Jun 9, 2026
c856c2c
feat(etl): scan ArticleProvenance (DynamoDB) -> article_provenance ta…
paulalbert1 Jun 14, 2026
acd26bb
Merge pull request #96 from wcmc-its/feature/article-provenance-etl
paulalbert1 Jun 14, 2026
fca972a
Revert "Merge pull request #96 from wcmc-its/feature/article-provenan…
paulalbert1 Jun 14, 2026
20bacf4
Merge pull request #97 from wcmc-its/revert/article-provenance-etl
paulalbert1 Jun 14, 2026
408c407
feat(setup): mirror RBAC permission tables + impersonatedByUserID column
paulalbert1 Jun 17, 2026
407aa44
Merge pull request #100 from wcmc-its/feature/rbac-schema-mirror-dev
paulalbert1 Jun 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@ update/*.log
update/app.log
update/retrieveNIH.log
update/temp/
retrieveNIH.log

# One-shot audit / repair artifacts (contain prod abstract text; never commit)
audit_abstracts.csv
audit_abstracts_dump.txt
invalid_pmids.txt
invalid_pmids.sql
reporting_abstracts_corrupt_backup_*.sql

# Legacy ML models (unused)
update/*.keras
Expand Down
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ ENV PYTHONUNBUFFERED=1

# Copy additional Python scripts
COPY update/retrieveNIH.py ./
COPY update/retrieveReporter.py ./
COPY update/retrieveAltmetric.py ./
COPY update/retrieveArticles.py ./
COPY update/updateReciterDB.py ./
Expand Down
76 changes: 76 additions & 0 deletions setup/alter_add_admin_user_scope_columns_v1.5.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
-- =============================================================================
-- Migration: Add admin_users scope/proxy columns (v1.5)
-- =============================================================================
-- Adds the three JSON scope columns the Publication Manager AdminUser model
-- now selects on every login:
-- - scope_person_types (JSON, nullable) — person-type curation scope
-- - scope_org_units (JSON, nullable) — org-unit curation scope
-- - proxy_person_ids (JSON, nullable) — proxied person identifiers
--
-- WHY THIS MIGRATION EXISTS:
-- ReCiter-Publication-Manager (dev branch, model commit 579d32f
-- "extend AdminUser model with scope/proxy JSON columns") issues
-- SELECT userID, personIdentifier, ..., scope_person_types,
-- scope_org_units, proxy_person_ids FROM admin_users
-- inside findOrcreateAdminUser during authentication. If admin_users is
-- missing these columns the SELECT fails with ER_BAD_FIELD_ERROR
-- ("Unknown column 'scope_person_types'"), the authorize() call throws, and
-- login returns 401 for every user. The columns must exist before the PM
-- dev branch is deployed against this database.
--
-- The fresh-build schema (setup/createDatabaseTableReciterDb.sql on master,
-- PR #92) already defines admin_users WITH these columns, so new databases
-- are fine. This migration brings EXISTING databases (e.g. the production
-- reciterdb, which predates #92 and has none of the three) up to that
-- schema. There was no ALTER path for existing DBs until now.
--
-- DURABILITY: admin_users is curator state, not a reporting export. It is NOT
-- in update/updateReciterDB.py's truncate list (`all_tables`) and is not
-- touched by any nightly stored procedure or ETL step, so these columns
-- persist across nightly reload.
--
-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via an
-- information_schema check (no-op on re-run). Additive only — no existing
-- column or row is modified. Run BEFORE deploying the PM dev branch.
-- =============================================================================

-- -----------------------------------------------------------------------------
-- admin_users: + scope_person_types + scope_org_units + proxy_person_ids
-- -----------------------------------------------------------------------------

-- Each guard below builds exactly one statement as text and runs it through
-- PREPARE / EXECUTE: the ALTER when information_schema shows the column is
-- absent, otherwise a harmless SELECT reporting that it already exists.
--
-- The schema is resolved with DATABASE() inside every guard (the same way the
-- Verification query does), so a guard stays correct when it is run on its
-- own and never depends on a session variable assigned earlier in the script.
-- With an unset variable the old lookup matched nothing and the ALTER was
-- attempted even though the column was already there.

-- scope_person_types: person-type curation scope
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'admin_users'
              AND column_name = 'scope_person_types'
        )
        THEN 'ALTER TABLE admin_users ADD COLUMN `scope_person_types` JSON DEFAULT NULL'
        ELSE 'SELECT ''admin_users.scope_person_types already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- scope_org_units: org-unit curation scope
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'admin_users'
              AND column_name = 'scope_org_units'
        )
        THEN 'ALTER TABLE admin_users ADD COLUMN `scope_org_units` JSON DEFAULT NULL'
        ELSE 'SELECT ''admin_users.scope_org_units already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- proxy_person_ids: proxied person identifiers
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'admin_users'
              AND column_name = 'proxy_person_ids'
        )
        THEN 'ALTER TABLE admin_users ADD COLUMN `proxy_person_ids` JSON DEFAULT NULL'
        ELSE 'SELECT ''admin_users.proxy_person_ids already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- -----------------------------------------------------------------------------
-- Verification
-- -----------------------------------------------------------------------------

-- Post-migration check: lists whichever of the three scope/proxy columns now
-- exist on admin_users in the current schema, in table column order.
SELECT
    c.table_name,
    c.column_name,
    c.data_type,
    c.is_nullable
FROM information_schema.columns AS c
WHERE c.table_schema = DATABASE()
  AND c.table_name = 'admin_users'
  AND c.column_name IN (
      'scope_person_types',
      'scope_org_units',
      'proxy_person_ids'
  )
ORDER BY c.ordinal_position;
80 changes: 80 additions & 0 deletions setup/alter_add_feature_generator_fields_v1.1.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
-- =============================================================================
-- Migration: Add 4 new Feature Generator fields (v1.1)
-- =============================================================================
-- Adds columns introduced by ReCiter Feature Generator:
-- - datePublicationAddedToPMC (top-level article field)
-- - feedbackScoreTextSimilarity (evidence.feedbackEvidence)
-- - feedbackScoreJournalTitleSimilarity (evidence.feedbackEvidence)
-- - feedbackScoreBibliographicCoupling (evidence.feedbackEvidence)
--
-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via
-- information_schema check (no-op on re-run).
--
-- Run BEFORE deploying the updated Python ETL, otherwise LOAD DATA INFILE
-- will fail with "Unknown column" on the 4 new headers.
-- =============================================================================

-- -----------------------------------------------------------------------------
-- person_article: + datePublicationAddedToPMC + 3 feedback scores
-- -----------------------------------------------------------------------------

-- Each guard below builds exactly one statement as text and runs it through
-- PREPARE / EXECUTE: the ALTER when information_schema shows the column is
-- absent, otherwise a harmless SELECT reporting that it already exists.
--
-- The schema is resolved with DATABASE() inside every guard (the same way the
-- Verification query does), so any single guard -- including the
-- analysis_summary_article one further down -- stays correct when it is run
-- on its own and never depends on a session variable assigned earlier.
-- With an unset variable the old lookup matched nothing and the ALTER was
-- attempted even though the column was already there.

-- person_article.datePublicationAddedToPMC
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'person_article'
              AND column_name = 'datePublicationAddedToPMC'
        )
        THEN 'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL'
        ELSE 'SELECT ''person_article.datePublicationAddedToPMC already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- person_article.feedbackScoreTextSimilarity
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'person_article'
              AND column_name = 'feedbackScoreTextSimilarity'
        )
        THEN 'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL'
        ELSE 'SELECT ''person_article.feedbackScoreTextSimilarity already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- person_article.feedbackScoreJournalTitleSimilarity
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'person_article'
              AND column_name = 'feedbackScoreJournalTitleSimilarity'
        )
        THEN 'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL'
        ELSE 'SELECT ''person_article.feedbackScoreJournalTitleSimilarity already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- person_article.feedbackScoreBibliographicCoupling
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'person_article'
              AND column_name = 'feedbackScoreBibliographicCoupling'
        )
        THEN 'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL'
        ELSE 'SELECT ''person_article.feedbackScoreBibliographicCoupling already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- -----------------------------------------------------------------------------
-- analysis_summary_article: + datePublicationAddedToPMC
-- (feedback scores NOT carried into summary — per-person-article only)
-- -----------------------------------------------------------------------------

-- analysis_summary_article.datePublicationAddedToPMC
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'analysis_summary_article'
              AND column_name = 'datePublicationAddedToPMC'
        )
        THEN 'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL'
        ELSE 'SELECT ''analysis_summary_article.datePublicationAddedToPMC already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- -----------------------------------------------------------------------------
-- Verification
-- -----------------------------------------------------------------------------

-- Post-migration check: lists every column in the current schema whose name
-- is one of the four new Feature Generator fields. There is deliberately no
-- table filter, so any table carrying one of these column names is reported.
SELECT
    c.table_name,
    c.column_name,
    c.data_type,
    c.is_nullable
FROM information_schema.columns AS c
WHERE c.table_schema = DATABASE()
  AND c.column_name IN (
      'datePublicationAddedToPMC',
      'feedbackScoreTextSimilarity',
      'feedbackScoreJournalTitleSimilarity',
      'feedbackScoreBibliographicCoupling'
  )
ORDER BY
    c.table_name,
    c.ordinal_position;
134 changes: 134 additions & 0 deletions setup/alter_add_reporter_fields_v1.2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
-- =============================================================================
-- Migration: NIH RePORTER integration (v1.2)
-- =============================================================================
-- Adds the tables needed to ingest pub-grant linkages and project metadata
-- from NIH RePORTER (https://api.reporter.nih.gov/v2/) and to track per-pair
-- provenance over time.
--
-- WHY SEPARATE TABLES (not columns on person_article_grant):
-- person_article_grant is TRUNCATE-reloaded by updateReciterDB.py every
-- night from ReCiter scoring output (see updateReciterDB.py:241). Any
-- provenance columns added directly to that table would be wiped on each
-- nightly run, defeating the purpose of *_first_seen tracking. The
-- provenance table below is updated incrementally and survives reloads.
--
-- WHAT'S CREATED:
-- 1. grant_reporter_project — RePORTER /projects/search results
-- 2. grant_reporter_link — RePORTER /publications/search results
-- 3. grant_provenance — long-lived per-(person, pmid, grant)
-- source-and-timestamp log
--
-- Safe to run on prod and dev. Idempotent (CREATE TABLE IF NOT EXISTS).
-- Run BEFORE deploying retrieveReporter.py.
-- =============================================================================

-- -----------------------------------------------------------------------------
-- grant_reporter_project — RePORTER project metadata
-- -----------------------------------------------------------------------------
-- One row per RePORTER appl_id returned by /projects/search for the configured
-- WCM org filter. Refreshed each ETL cycle (truncate-reload OK; no historical
-- state to preserve here — RePORTER is the source of truth).
--
-- abstract_text is stored here as a cross-reference. The Funding UI reads
-- abstracts from Postgres (Scholars-Profile-System) where they're joined to
-- InfoEd grant rows; this column exists for ad-hoc analysis and future
-- reciterdb-side consumers.
--
-- project_terms / pref_terms hold the NIH-curated keyword vocabulary RePORTER
-- returns per project, stored raw (project_terms angle-bracket-wrapped,
-- pref_terms semicolon-delimited). Added by alter_add_reporter_terms_v1.3.sql;
-- mirrored into the CREATE TABLE here so a fresh build matches (issue #291).
-- -----------------------------------------------------------------------------

-- RePORTER project metadata, one row per appl_id. IF NOT EXISTS makes this a
-- no-op when the table is already present, so it never alters a live table.
CREATE TABLE IF NOT EXISTS `grant_reporter_project` (
    `appl_id`            INT(11)      NOT NULL,       -- RePORTER application id (primary key)
    `core_project_num`   VARCHAR(32)  DEFAULT NULL,   -- indexed for lookups by project number
    `project_title`      VARCHAR(512) DEFAULT NULL,
    `org_name`           VARCHAR(255) DEFAULT NULL,
    `fiscal_year`        SMALLINT(6)  DEFAULT NULL,
    `activity_code`      VARCHAR(8)   DEFAULT NULL,
    `project_start_date` DATE         DEFAULT NULL,
    `project_end_date`   DATE         DEFAULT NULL,
    `abstract_text`      MEDIUMTEXT   DEFAULT NULL,   -- kept as a cross-reference for ad-hoc analysis
    `project_terms`      TEXT         DEFAULT NULL,   -- raw RePORTER terms, angle-bracket-wrapped
    `pref_terms`         TEXT         DEFAULT NULL,   -- raw RePORTER pref_terms, semicolon-delimited
    `last_fetched_at`    DATETIME     NOT NULL DEFAULT CURRENT_TIMESTAMP,   -- defaults to insert time
    PRIMARY KEY (`appl_id`),
    INDEX `core_project_num` (`core_project_num`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- -----------------------------------------------------------------------------
-- grant_reporter_link — RePORTER pub-grant linkages
-- -----------------------------------------------------------------------------
-- One row per (pmid, appl_id) pair returned by /publications/search.
-- Refreshed each ETL cycle (truncate-reload). The grant_provenance table
-- below is what carries history.
-- -----------------------------------------------------------------------------

-- RePORTER publication-to-grant linkages: a surrogate id plus a uniqueness
-- constraint on the (pmid, appl_id) pair. IF NOT EXISTS makes this a no-op
-- when the table is already present.
CREATE TABLE IF NOT EXISTS `grant_reporter_link` (
    `id`               INT(11)     NOT NULL AUTO_INCREMENT,   -- surrogate key
    `pmid`             INT(11)     NOT NULL,
    `appl_id`          INT(11)     NOT NULL,
    `core_project_num` VARCHAR(32) DEFAULT NULL,
    `last_fetched_at`  DATETIME    NOT NULL DEFAULT CURRENT_TIMESTAMP,   -- defaults to insert time
    PRIMARY KEY (`id`),
    UNIQUE INDEX `uk_pmid_appl_id` (`pmid`, `appl_id`),   -- at most one row per (pmid, appl_id)
    -- NOTE(review): `pmid` is also the leftmost column of uk_pmid_appl_id, so
    -- this single-column index looks redundant. Confirm before dropping it:
    -- databases created from this script already carry it.
    INDEX `pmid` (`pmid`),
    INDEX `core_project_num` (`core_project_num`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- -----------------------------------------------------------------------------
-- grant_provenance — per-(person, pmid, grant) source and timestamp log
-- -----------------------------------------------------------------------------
-- The audit log that survives nightly truncate-reload of person_article_grant.
-- Keyed by (personIdentifier, pmid, core_project_num) where core_project_num
-- is the normalized NIH grant identifier (e.g. "R01DK127777" — no year suffix,
-- no spaces). For non-NIH grants the original articleGrant string is stored
-- in core_project_num as a fallback so the row is still keyable.
--
-- Update logic (run nightly by retrieveReporter.py after person_article_grant
-- has been refreshed by retrieveArticles.py):
--
-- 1. UPSERT from person_article_grant: any (personIdentifier, pmid,
-- normalized_grant) currently in person_article_grant gets
-- source_reciterdb=1 and last_verified=NOW(). reciterdb_first_seen is
-- set on first insert and never overwritten.
--
-- 2. UPSERT from grant_reporter_link joined to person_article (where
-- userAssertion='ACCEPTED' to scope to confirmed WCM authors): any
-- (personIdentifier, pmid, core_project_num) seen in RePORTER gets
-- source_reporter=1 and last_verified=NOW(). reporter_first_seen is
-- set on first insert and never overwritten.
--
-- Subaward caution: see retrieveReporter.py — we filter RePORTER projects
-- to org_names=["WEILL MEDICAL COLL OF CORNELL UNIV"] and join PMIDs to
-- person_article ACCEPTED rows. This minimizes false positives at the cost
-- of missing some legitimate WCM-as-subaward linkages.
-- -----------------------------------------------------------------------------

-- Long-lived provenance log: one row per (personIdentifier, pmid,
-- core_project_num), with a flag and first-seen timestamp for each source.
-- IF NOT EXISTS makes this a no-op when the table is already present.
CREATE TABLE IF NOT EXISTS `grant_provenance` (
    `id`                   INT(11)      NOT NULL AUTO_INCREMENT,   -- surrogate key
    `personIdentifier`     VARCHAR(128) NOT NULL,
    `pmid`                 INT(11)      NOT NULL,
    `core_project_num`     VARCHAR(64)  NOT NULL,                  -- normalized NIH grant id, or the original grant string for non-NIH grants
    `appl_id`              INT(11)      DEFAULT NULL,
    `source_reporter`      TINYINT(1)   NOT NULL DEFAULT 0,        -- 1 once the pair has been seen in RePORTER
    `source_reciterdb`     TINYINT(1)   NOT NULL DEFAULT 0,        -- 1 once the pair has been seen in person_article_grant
    `reporter_first_seen`  DATETIME     DEFAULT NULL,              -- NULL until first seen in RePORTER
    `reciterdb_first_seen` DATETIME     DEFAULT NULL,              -- NULL until first seen in person_article_grant
    `last_verified`        DATETIME     NOT NULL DEFAULT CURRENT_TIMESTAMP,   -- defaults to insert time
    PRIMARY KEY (`id`),
    UNIQUE INDEX `uk_person_pmid_grant` (`personIdentifier`, `pmid`, `core_project_num`),   -- upsert key
    INDEX `pmid` (`pmid`),
    INDEX `appl_id` (`appl_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

-- -----------------------------------------------------------------------------
-- Verification
-- -----------------------------------------------------------------------------

-- Post-migration check: one row for each of the three RePORTER tables that
-- exists in the current schema. InnoDB reports table_rows as an estimate, so
-- treat it as a rough indicator rather than an exact count.
SELECT
    t.table_name,
    t.table_rows,
    t.create_time
FROM information_schema.tables AS t
WHERE t.table_schema = DATABASE()
  AND t.table_name IN (
      'grant_reporter_project',
      'grant_reporter_link',
      'grant_provenance'
  )
ORDER BY t.table_name;
59 changes: 59 additions & 0 deletions setup/alter_add_reporter_terms_v1.3.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
-- =============================================================================
-- Migration: NIH RePORTER project terms (v1.3)
-- =============================================================================
-- Adds two columns to grant_reporter_project for the NIH-curated keyword
-- vocabulary RePORTER returns alongside the abstract:
-- - project_terms — RePORTER `terms`, angle-bracket-wrapped (<a><b><c>)
-- - pref_terms — RePORTER `pref_terms`, semicolon-delimited (a;b;c)
--
-- Stored raw, verbatim from the API. Parsing into a keyword array happens
-- downstream in the Scholars-Profile-System ETL (issue #291); reciterdb keeps
-- the unparsed strings so a future reciterdb-side consumer can re-parse.
--
-- WHY AN ALTER, NOT THE CREATE TABLE in v1.2:
-- alter_add_reporter_fields_v1.2.sql creates grant_reporter_project with
-- CREATE TABLE IF NOT EXISTS — a no-op once the table exists, so editing its
-- body would not add columns to a live table. This file uses the
-- information_schema-guarded ALTER idiom (cf. v1.1) so it is safe on a
-- populated prod/dev table. The two columns were also added to v1.2's
-- CREATE TABLE so a fresh build matches.
--
-- Safe to run on prod and dev. Idempotent (information_schema guard; no-op on
-- re-run). No AFTER clause — keeps ALGORITHM=INSTANT eligible.
--
-- Run BEFORE deploying the updated retrieveReporter.py, otherwise the project
-- INSERT will fail with "Unknown column" on the 2 new fields.
-- =============================================================================

-- -----------------------------------------------------------------------------
-- grant_reporter_project: + project_terms + pref_terms
-- -----------------------------------------------------------------------------

-- Each guard below builds exactly one statement as text and runs it through
-- PREPARE / EXECUTE: the ALTER when information_schema shows the column is
-- absent, otherwise a harmless SELECT reporting that it already exists.
--
-- The schema is resolved with DATABASE() inside every guard (the same way the
-- Verification query does), so a guard stays correct when it is run on its
-- own and never depends on a session variable assigned earlier in the script.
-- With an unset variable the old lookup matched nothing and the ALTER was
-- attempted even though the column was already there.

-- project_terms: RePORTER `terms`, stored raw (angle-bracket-wrapped)
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'grant_reporter_project'
              AND column_name = 'project_terms'
        )
        THEN 'ALTER TABLE grant_reporter_project ADD COLUMN `project_terms` text DEFAULT NULL'
        ELSE 'SELECT ''grant_reporter_project.project_terms already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- pref_terms: RePORTER `pref_terms`, stored raw (semicolon-delimited)
SET @sql = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.columns
            WHERE table_schema = DATABASE()
              AND table_name = 'grant_reporter_project'
              AND column_name = 'pref_terms'
        )
        THEN 'ALTER TABLE grant_reporter_project ADD COLUMN `pref_terms` text DEFAULT NULL'
        ELSE 'SELECT ''grant_reporter_project.pref_terms already exists'''
    END
);
PREPARE stmt FROM @sql;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;

-- -----------------------------------------------------------------------------
-- Verification
-- -----------------------------------------------------------------------------

-- Post-migration check: lists whichever of the two term columns now exist on
-- grant_reporter_project in the current schema, in table column order.
SELECT
    c.table_name,
    c.column_name,
    c.data_type,
    c.is_nullable
FROM information_schema.columns AS c
WHERE c.table_schema = DATABASE()
  AND c.table_name = 'grant_reporter_project'
  AND c.column_name IN (
      'project_terms',
      'pref_terms'
  )
ORDER BY c.ordinal_position;
Loading