wcmc-its · paulalbert1 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 27, 2023
diff --git a/.gitignore b/.gitignore
@@ -12,6 +12,14 @@ update/*.log
 update/app.log
 update/retrieveNIH.log
 update/temp/
+retrieveNIH.log
+
+# One-shot audit / repair artifacts (contain prod abstract text; never commit)
+audit_abstracts.csv
+audit_abstracts_dump.txt
+invalid_pmids.txt
+invalid_pmids.sql
+reporting_abstracts_corrupt_backup_*.sql
 
 # Legacy ML models (unused)
 update/*.keras

diff --git a/Dockerfile b/Dockerfile
@@ -13,6 +13,7 @@ ENV PYTHONUNBUFFERED=1
 
 # Copy additional Python scripts
 COPY update/retrieveNIH.py ./
+COPY update/retrieveReporter.py ./
 COPY update/retrieveAltmetric.py ./
 COPY update/retrieveArticles.py ./
 COPY update/updateReciterDB.py ./

diff --git a/setup/alter_add_admin_user_scope_columns_v1.5.sql b/setup/alter_add_admin_user_scope_columns_v1.5.sql
@@ -0,0 +1,76 @@
+-- =============================================================================
+-- Migration: Add admin_users scope/proxy columns (v1.5)
+-- =============================================================================
+-- Adds the three JSON scope/proxy columns the Publication Manager AdminUser
+-- model now selects on every login:
+--   - scope_person_types   (JSON, nullable) — person-type curation scope
+--   - scope_org_units      (JSON, nullable) — org-unit curation scope
+--   - proxy_person_ids     (JSON, nullable) — proxied person identifiers
+--
+-- WHY THIS MIGRATION EXISTS:
+--   ReCiter-Publication-Manager (dev branch, model commit 579d32f
+--   "extend AdminUser model with scope/proxy JSON columns") issues
+--     SELECT userID, personIdentifier, ..., scope_person_types,
+--            scope_org_units, proxy_person_ids FROM admin_users
+--   inside findOrcreateAdminUser during authentication. If admin_users is
+--   missing these columns the SELECT fails with ER_BAD_FIELD_ERROR
+--   ("Unknown column 'scope_person_types'"), the authorize() call throws, and
+--   login returns 401 for every user. The columns must exist before the PM
+--   dev branch is deployed against this database.
+--
+--   The fresh-build schema (setup/createDatabaseTableReciterDb.sql on master,
+--   PR #92) already defines admin_users WITH these columns, so new databases
+--   are fine. This migration brings EXISTING databases (e.g. the production
+--   reciterdb, which predates #92 and has none of the three) up to that
+--   schema. There was no ALTER path for existing DBs until now.
+--
+-- DURABILITY: admin_users is curator state, not a reporting export. It is NOT
+--   in update/updateReciterDB.py's truncate list (`all_tables`) and is not
+--   touched by any nightly stored procedure or ETL step, so these columns
+--   persist across nightly reload.
+--
+-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via an
+-- information_schema check (no-op on re-run). Additive only — no existing
+-- column or row is modified. Run BEFORE deploying the PM dev branch.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- admin_users: + scope_person_types + scope_org_units + proxy_person_ids
+-- -----------------------------------------------------------------------------
+
+SET @db = DATABASE();  -- default schema of this session, read by every guard in this script
+
+-- scope_person_types: emit the ALTER only when information_schema has no such column yet.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'admin_users' AND column_name = 'scope_person_types')
+    THEN 'SELECT ''admin_users.scope_person_types already exists'''
+    ELSE 'ALTER TABLE admin_users ADD COLUMN `scope_person_types` JSON DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- scope_org_units: emit the ALTER only when information_schema has no such column yet.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'admin_users' AND column_name = 'scope_org_units')
+    THEN 'SELECT ''admin_users.scope_org_units already exists'''
+    ELSE 'ALTER TABLE admin_users ADD COLUMN `scope_org_units` JSON DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- proxy_person_ids: emit the ALTER only when information_schema has no such column yet.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'admin_users' AND column_name = 'proxy_person_ids')
+    THEN 'SELECT ''admin_users.proxy_person_ids already exists'''
+    ELSE 'ALTER TABLE admin_users ADD COLUMN `proxy_person_ids` JSON DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+-- Expect three rows, one per JSON column added by this migration, each nullable.
+SELECT c.table_name, c.column_name, c.data_type, c.is_nullable
+FROM information_schema.columns AS c
+WHERE c.table_schema = DATABASE() AND c.table_name = 'admin_users'
+  AND c.column_name IN ('scope_person_types', 'scope_org_units', 'proxy_person_ids')
+ORDER BY c.ordinal_position;
diff --git a/setup/alter_add_feature_generator_fields_v1.1.sql b/setup/alter_add_feature_generator_fields_v1.1.sql
@@ -0,0 +1,80 @@
+-- =============================================================================
+-- Migration: Add 4 new Feature Generator fields (v1.1)
+-- =============================================================================
+-- Adds columns introduced by ReCiter Feature Generator:
+--   - datePublicationAddedToPMC          (top-level article field)
+--   - feedbackScoreTextSimilarity        (evidence.feedbackEvidence)
+--   - feedbackScoreJournalTitleSimilarity (evidence.feedbackEvidence)
+--   - feedbackScoreBibliographicCoupling  (evidence.feedbackEvidence)
+--
+-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via
+-- information_schema check (no-op on re-run).
+--
+-- Run BEFORE deploying the updated Python ETL, otherwise LOAD DATA INFILE
+-- will fail with "Unknown column" on the 4 new headers.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- person_article: + datePublicationAddedToPMC + 3 feedback scores
+-- -----------------------------------------------------------------------------
+
+SET @db = DATABASE();  -- default schema of this session, read by every guard in this script
+
+-- datePublicationAddedToPMC: emit the ALTER only when person_article lacks the column.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'datePublicationAddedToPMC')
+    THEN 'SELECT ''person_article.datePublicationAddedToPMC already exists'''
+    ELSE 'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- feedbackScoreTextSimilarity: emit the ALTER only when person_article lacks the column.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'feedbackScoreTextSimilarity')
+    THEN 'SELECT ''person_article.feedbackScoreTextSimilarity already exists'''
+    ELSE 'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- feedbackScoreJournalTitleSimilarity: emit the ALTER only when person_article lacks the column.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'feedbackScoreJournalTitleSimilarity')
+    THEN 'SELECT ''person_article.feedbackScoreJournalTitleSimilarity already exists'''
+    ELSE 'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- feedbackScoreBibliographicCoupling: emit the ALTER only when person_article lacks the column.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'feedbackScoreBibliographicCoupling')
+    THEN 'SELECT ''person_article.feedbackScoreBibliographicCoupling already exists'''
+    ELSE 'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- analysis_summary_article: + datePublicationAddedToPMC
+-- (feedback scores NOT carried into summary — per-person-article only)
+-- -----------------------------------------------------------------------------
+
+-- datePublicationAddedToPMC: emit the ALTER only when analysis_summary_article lacks the column.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'analysis_summary_article' AND column_name = 'datePublicationAddedToPMC')
+    THEN 'SELECT ''analysis_summary_article.datePublicationAddedToPMC already exists'''
+    ELSE 'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+-- Expect 5 rows: 1 for analysis_summary_article, then 4 for person_article. The
+-- table filter keeps same-named columns on other tables or views out of the check.
+SELECT table_name, column_name, data_type, is_nullable
+FROM information_schema.columns
+WHERE table_schema = DATABASE()
+  AND table_name IN ('person_article', 'analysis_summary_article')
+  AND column_name IN ('datePublicationAddedToPMC', 'feedbackScoreTextSimilarity',
+                      'feedbackScoreJournalTitleSimilarity', 'feedbackScoreBibliographicCoupling')
+ORDER BY table_name, ordinal_position;
diff --git a/setup/alter_add_reporter_fields_v1.2.sql b/setup/alter_add_reporter_fields_v1.2.sql
@@ -0,0 +1,134 @@
+-- =============================================================================
+-- Migration: NIH RePORTER integration (v1.2)
+-- =============================================================================
+-- Adds the tables needed to ingest pub-grant linkages and project metadata
+-- from NIH RePORTER (https://api.reporter.nih.gov/v2/) and to track per-pair
+-- provenance over time.
+--
+-- WHY SEPARATE TABLES (not columns on person_article_grant):
+--   person_article_grant is TRUNCATE-reloaded by updateReciterDB.py every
+--   night from ReCiter scoring output (see updateReciterDB.py:241). Any
+--   provenance columns added directly to that table would be wiped on each
+--   nightly run, defeating the purpose of *_first_seen tracking. The
+--   provenance table below is updated incrementally and survives reloads.
+--
+-- WHAT'S CREATED:
+--   1. grant_reporter_project — RePORTER /projects/search results
+--   2. grant_reporter_link    — RePORTER /publications/search results
+--   3. grant_provenance       — long-lived per-(person, pmid, grant)
+--                               source-and-timestamp log
+--
+-- Safe to run on prod and dev. Idempotent (CREATE TABLE IF NOT EXISTS).
+-- Run BEFORE deploying retrieveReporter.py.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- grant_reporter_project — RePORTER project metadata
+-- -----------------------------------------------------------------------------
+-- One row per RePORTER appl_id returned by /projects/search for the configured
+-- WCM org filter. Refreshed each ETL cycle (truncate-reload OK; no historical
+-- state to preserve here — RePORTER is the source of truth).
+--
+-- abstract_text is stored here as a cross-reference. The Funding UI reads
+-- abstracts from Postgres (Scholars-Profile-System) where they're joined to
+-- InfoEd grant rows; this column exists for ad-hoc analysis and future
+-- reciterdb-side consumers.
+--
+-- project_terms / pref_terms hold the NIH-curated keyword vocabulary RePORTER
+-- returns per project, stored raw (project_terms angle-bracket-wrapped,
+-- pref_terms semicolon-delimited). Added by alter_add_reporter_terms_v1.3.sql;
+-- mirrored into the CREATE TABLE here so a fresh build matches (issue #291).
+-- -----------------------------------------------------------------------------
+
+CREATE TABLE IF NOT EXISTS `grant_reporter_project` (  -- no-op when the table already exists
+  `appl_id` int NOT NULL,                               -- RePORTER appl_id, one row each (no display width: deprecated in MySQL 8.0.17+)
+  `core_project_num` varchar(32) DEFAULT NULL,
+  `project_title` varchar(512) DEFAULT NULL,
+  `org_name` varchar(255) DEFAULT NULL,
+  `fiscal_year` smallint DEFAULT NULL,
+  `activity_code` varchar(8) DEFAULT NULL,
+  `project_start_date` date DEFAULT NULL,
+  `project_end_date` date DEFAULT NULL,
+  `abstract_text` mediumtext DEFAULT NULL,              -- cross-reference copy for ad-hoc analysis
+  `project_terms` text DEFAULT NULL,                    -- raw RePORTER terms, angle-bracket-wrapped
+  `pref_terms` text DEFAULT NULL,                       -- raw RePORTER pref_terms, semicolon-delimited
+  `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (`appl_id`),
+  KEY `core_project_num` (`core_project_num`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------------------------------------------------------
+-- grant_reporter_link — RePORTER pub-grant linkages
+-- -----------------------------------------------------------------------------
+-- One row per (pmid, appl_id) pair returned by /publications/search.
+-- Refreshed each ETL cycle (truncate-reload). The grant_provenance table
+-- below is what carries history.
+-- -----------------------------------------------------------------------------
+
+CREATE TABLE IF NOT EXISTS `grant_reporter_link` (  -- no-op when the table already exists
+  `id` int NOT NULL AUTO_INCREMENT,                  -- surrogate key (no display width: deprecated in MySQL 8.0.17+)
+  `pmid` int NOT NULL,
+  `appl_id` int NOT NULL,
+  `core_project_num` varchar(32) DEFAULT NULL,
+  `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `uk_pmid_appl_id` (`pmid`, `appl_id`),  -- enforces one row per (pmid, appl_id) pair
+  KEY `pmid` (`pmid`),                               -- NOTE(review): pmid is already the leftmost column of uk_pmid_appl_id, so this index looks redundant - confirm before dropping
+  KEY `core_project_num` (`core_project_num`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------------------------------------------------------
+-- grant_provenance — per-(person, pmid, grant) source and timestamp log
+-- -----------------------------------------------------------------------------
+-- The audit log that survives nightly truncate-reload of person_article_grant.
+-- Keyed by (personIdentifier, pmid, core_project_num) where core_project_num
+-- is the normalized NIH grant identifier (e.g. "R01DK127777" — no year suffix,
+-- no spaces). For non-NIH grants the original articleGrant string is stored
+-- in core_project_num as a fallback so the row is still keyable.
+--
+-- Update logic (run nightly by retrieveReporter.py after person_article_grant
+-- has been refreshed by retrieveArticles.py):
+--
+--   1. UPSERT from person_article_grant: any (personIdentifier, pmid,
+--      normalized_grant) currently in person_article_grant gets
+--      source_reciterdb=1 and last_verified=NOW(). reciterdb_first_seen is
+--      set on first insert and never overwritten.
+--
+--   2. UPSERT from grant_reporter_link joined to person_article (where
+--      userAssertion='ACCEPTED' to scope to confirmed WCM authors): any
+--      (personIdentifier, pmid, core_project_num) seen in RePORTER gets
+--      source_reporter=1 and last_verified=NOW(). reporter_first_seen is
+--      set on first insert and never overwritten.
+--
+-- Subaward caution: see retrieveReporter.py — we filter RePORTER projects
+-- to org_names=["WEILL MEDICAL COLL OF CORNELL UNIV"] and join PMIDs to
+-- person_article ACCEPTED rows. This minimizes false positives at the cost
+-- of missing some legitimate WCM-as-subaward linkages.
+-- -----------------------------------------------------------------------------
+
+CREATE TABLE IF NOT EXISTS `grant_provenance` (  -- no-op when the table already exists
+  `id` int NOT NULL AUTO_INCREMENT,               -- surrogate key (no display width: deprecated in MySQL 8.0.17+)
+  `personIdentifier` varchar(128) NOT NULL,
+  `pmid` int NOT NULL,
+  `core_project_num` varchar(64) NOT NULL,        -- NOTE(review): also holds the raw articleGrant string for non-NIH grants - confirm 64 chars is enough
+  `appl_id` int DEFAULT NULL,
+  `source_reporter` tinyint(1) NOT NULL DEFAULT 0,   -- 1 once the triple has been seen via RePORTER
+  `source_reciterdb` tinyint(1) NOT NULL DEFAULT 0,  -- 1 once the triple has been seen in person_article_grant
+  `reporter_first_seen` datetime DEFAULT NULL,    -- intended to be written on first insert only
+  `reciterdb_first_seen` datetime DEFAULT NULL,   -- intended to be written on first insert only
+  `last_verified` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `uk_person_pmid_grant` (`personIdentifier`, `pmid`, `core_project_num`),  -- upsert key
+  KEY `pmid` (`pmid`),
+  KEY `appl_id` (`appl_id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+-- Expect three rows, one per table this migration defines (table_rows is an InnoDB estimate, not an exact count).
+SELECT t.table_name, t.table_rows, t.create_time
+FROM information_schema.tables AS t
+WHERE t.table_schema = DATABASE() AND t.table_name IN ('grant_reporter_project', 'grant_reporter_link', 'grant_provenance')
+ORDER BY t.table_name;
diff --git a/setup/alter_add_reporter_terms_v1.3.sql b/setup/alter_add_reporter_terms_v1.3.sql
@@ -0,0 +1,59 @@
+-- =============================================================================
+-- Migration: NIH RePORTER project terms (v1.3)
+-- =============================================================================
+-- Adds two columns to grant_reporter_project for the NIH-curated keyword
+-- vocabulary RePORTER returns alongside the abstract:
+--   - project_terms — RePORTER `terms`, angle-bracket-wrapped (<a><b><c>)
+--   - pref_terms    — RePORTER `pref_terms`, semicolon-delimited (a;b;c)
+--
+-- Stored raw, verbatim from the API. Parsing into a keyword array happens
+-- downstream in the Scholars-Profile-System ETL (issue #291); reciterdb keeps
+-- the unparsed strings so a future reciterdb-side consumer can re-parse.
+--
+-- WHY AN ALTER, NOT THE CREATE TABLE in v1.2:
+--   alter_add_reporter_fields_v1.2.sql creates grant_reporter_project with
+--   CREATE TABLE IF NOT EXISTS — a no-op once the table exists, so editing its
+--   body would not add columns to a live table. This file uses the
+--   information_schema-guarded ALTER idiom (cf. v1.1) so it is safe on a
+--   populated prod/dev table. The two columns were also added to v1.2's
+--   CREATE TABLE so a fresh build matches.
+--
+-- Safe to run on prod and dev. Idempotent (information_schema guard; no-op on
+-- re-run). No AFTER clause, so each ADD COLUMN stays eligible for ALGORITHM=INSTANT.
+--
+-- Run BEFORE deploying the updated retrieveReporter.py, otherwise the project
+-- INSERT will fail with "Unknown column" on the 2 new fields.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- grant_reporter_project: + project_terms + pref_terms
+-- -----------------------------------------------------------------------------
+
+SET @db = DATABASE();  -- default schema of this session, read by every guard in this script
+
+-- project_terms: emit the ALTER only when grant_reporter_project lacks the column.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'grant_reporter_project' AND column_name = 'project_terms')
+    THEN 'SELECT ''grant_reporter_project.project_terms already exists'''
+    ELSE 'ALTER TABLE grant_reporter_project ADD COLUMN `project_terms` text DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- pref_terms: emit the ALTER only when grant_reporter_project lacks the column.
+SET @sql = (SELECT CASE WHEN EXISTS (
+        SELECT 1 FROM information_schema.columns
+        WHERE table_schema = @db AND table_name = 'grant_reporter_project' AND column_name = 'pref_terms')
+    THEN 'SELECT ''grant_reporter_project.pref_terms already exists'''
+    ELSE 'ALTER TABLE grant_reporter_project ADD COLUMN `pref_terms` text DEFAULT NULL' END);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+-- Expect two rows, one per text column added by this migration, each nullable.
+SELECT c.table_name, c.column_name, c.data_type, c.is_nullable
+FROM information_schema.columns AS c
+WHERE c.table_schema = DATABASE() AND c.table_name = 'grant_reporter_project'
+  AND c.column_name IN ('project_terms', 'pref_terms')
+ORDER BY c.ordinal_position;