diff --git a/.gitignore b/.gitignore index 0f6ba4c..b2dd5b2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,14 @@ update/*.log update/app.log update/retrieveNIH.log update/temp/ +retrieveNIH.log + +# One-shot audit / repair artifacts (contain prod abstract text; never commit) +audit_abstracts.csv +audit_abstracts_dump.txt +invalid_pmids.txt +invalid_pmids.sql +reporting_abstracts_corrupt_backup_*.sql # Legacy ML models (unused) update/*.keras diff --git a/Dockerfile b/Dockerfile index 8501d0b..f4f344b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,6 +13,7 @@ ENV PYTHONUNBUFFERED=1 # Copy additional Python scripts COPY update/retrieveNIH.py ./ +COPY update/retrieveReporter.py ./ COPY update/retrieveAltmetric.py ./ COPY update/retrieveArticles.py ./ COPY update/updateReciterDB.py ./ diff --git a/setup/alter_add_admin_user_scope_columns_v1.5.sql b/setup/alter_add_admin_user_scope_columns_v1.5.sql new file mode 100644 index 0000000..98e4877 --- /dev/null +++ b/setup/alter_add_admin_user_scope_columns_v1.5.sql @@ -0,0 +1,76 @@ +-- ============================================================================= +-- Migration: Add admin_users scope/proxy columns (v1.5) +-- ============================================================================= +-- Adds the three JSON scope columns the Publication Manager AdminUser model +-- now selects on every login: +-- - scope_person_types (JSON, nullable) — person-type curation scope +-- - scope_org_units (JSON, nullable) — org-unit curation scope +-- - proxy_person_ids (JSON, nullable) — proxied person identifiers +-- +-- WHY THIS MIGRATION EXISTS: +-- ReCiter-Publication-Manager (dev branch, model commit 579d32f +-- "extend AdminUser model with scope/proxy JSON columns") issues +-- SELECT userID, personIdentifier, ..., scope_person_types, +-- scope_org_units, proxy_person_ids FROM admin_users +-- inside findOrcreateAdminUser during authentication. 
If admin_users is +-- missing these columns the SELECT fails with ER_BAD_FIELD_ERROR +-- ("Unknown column 'scope_person_types'"), the authorize() call throws, and +-- login returns 401 for every user. The columns must exist before the PM +-- dev branch is deployed against this database. +-- +-- The fresh-build schema (setup/createDatabaseTableReciterDb.sql on master, +-- PR #92) already defines admin_users WITH these columns, so new databases +-- are fine. This migration brings EXISTING databases (e.g. the production +-- reciterdb, which predates #92 and has none of the three) up to that +-- schema. There was no ALTER path for existing DBs until now. +-- +-- DURABILITY: admin_users is curator state, not a reporting export. It is NOT +-- in update/updateReciterDB.py's truncate list (`all_tables`) and is not +-- touched by any nightly stored procedure or ETL step, so these columns +-- persist across nightly reload. +-- +-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via an +-- information_schema check (no-op on re-run). Additive only — no existing +-- column or row is modified. Run BEFORE deploying the PM dev branch. 
+-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- admin_users: + scope_person_types + scope_org_units + proxy_person_ids +-- ----------------------------------------------------------------------------- + +SET @db = DATABASE(); + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'admin_users' + AND column_name = 'scope_person_types') = 0, + 'ALTER TABLE admin_users ADD COLUMN `scope_person_types` JSON DEFAULT NULL', + 'SELECT ''admin_users.scope_person_types already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'admin_users' + AND column_name = 'scope_org_units') = 0, + 'ALTER TABLE admin_users ADD COLUMN `scope_org_units` JSON DEFAULT NULL', + 'SELECT ''admin_users.scope_org_units already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'admin_users' + AND column_name = 'proxy_person_ids') = 0, + 'ALTER TABLE admin_users ADD COLUMN `proxy_person_ids` JSON DEFAULT NULL', + 'SELECT ''admin_users.proxy_person_ids already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = DATABASE() + AND table_name = 'admin_users' + AND column_name IN ('scope_person_types', 'scope_org_units', 'proxy_person_ids') +ORDER BY ordinal_position; diff --git 
a/setup/alter_add_feature_generator_fields_v1.1.sql b/setup/alter_add_feature_generator_fields_v1.1.sql new file mode 100644 index 0000000..fb4c8b7 --- /dev/null +++ b/setup/alter_add_feature_generator_fields_v1.1.sql @@ -0,0 +1,80 @@ +-- ============================================================================= +-- Migration: Add 4 new Feature Generator fields (v1.1) +-- ============================================================================= +-- Adds columns introduced by ReCiter Feature Generator: +-- - datePublicationAddedToPMC (top-level article field) +-- - feedbackScoreTextSimilarity (evidence.feedbackEvidence) +-- - feedbackScoreJournalTitleSimilarity (evidence.feedbackEvidence) +-- - feedbackScoreBibliographicCoupling (evidence.feedbackEvidence) +-- +-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via +-- information_schema check (no-op on re-run). +-- +-- Run BEFORE deploying the updated Python ETL, otherwise LOAD DATA INFILE +-- will fail with "Unknown column" on the 4 new headers. 
+-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- person_article: + datePublicationAddedToPMC + 3 feedback scores +-- ----------------------------------------------------------------------------- + +SET @db = DATABASE(); + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'datePublicationAddedToPMC') = 0, + 'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL', + 'SELECT ''person_article.datePublicationAddedToPMC already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'feedbackScoreTextSimilarity') = 0, + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL', + 'SELECT ''person_article.feedbackScoreTextSimilarity already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'feedbackScoreJournalTitleSimilarity') = 0, + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL', + 'SELECT ''person_article.feedbackScoreJournalTitleSimilarity already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'feedbackScoreBibliographicCoupling') = 0, + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL', + 'SELECT 
''person_article.feedbackScoreBibliographicCoupling already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- analysis_summary_article: + datePublicationAddedToPMC +-- (feedback scores NOT carried into summary — per-person-article only) +-- ----------------------------------------------------------------------------- + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'analysis_summary_article' + AND column_name = 'datePublicationAddedToPMC') = 0, + 'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL', + 'SELECT ''analysis_summary_article.datePublicationAddedToPMC already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = DATABASE() + AND column_name IN ( + 'datePublicationAddedToPMC', + 'feedbackScoreTextSimilarity', + 'feedbackScoreJournalTitleSimilarity', + 'feedbackScoreBibliographicCoupling') +ORDER BY table_name, ordinal_position; diff --git a/setup/alter_add_reporter_fields_v1.2.sql b/setup/alter_add_reporter_fields_v1.2.sql new file mode 100644 index 0000000..39c36bb --- /dev/null +++ b/setup/alter_add_reporter_fields_v1.2.sql @@ -0,0 +1,134 @@ +-- ============================================================================= +-- Migration: NIH RePORTER integration (v1.2) +-- ============================================================================= +-- Adds the tables needed to ingest pub-grant linkages and project metadata +-- from NIH RePORTER (https://api.reporter.nih.gov/v2/) and to track per-pair +-- 
provenance over time. +-- +-- WHY SEPARATE TABLES (not columns on person_article_grant): +-- person_article_grant is TRUNCATE-reloaded by updateReciterDB.py every +-- night from ReCiter scoring output (see updateReciterDB.py:241). Any +-- provenance columns added directly to that table would be wiped on each +-- nightly run, defeating the purpose of *_first_seen tracking. The +-- provenance table below is updated incrementally and survives reloads. +-- +-- WHAT'S CREATED: +-- 1. grant_reporter_project — RePORTER /projects/search results +-- 2. grant_reporter_link — RePORTER /publications/search results +-- 3. grant_provenance — long-lived per-(person, pmid, grant) +-- source-and-timestamp log +-- +-- Safe to run on prod and dev. Idempotent (CREATE TABLE IF NOT EXISTS). +-- Run BEFORE deploying retrieveReporter.py. +-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- grant_reporter_project — RePORTER project metadata +-- ----------------------------------------------------------------------------- +-- One row per RePORTER appl_id returned by /projects/search for the configured +-- WCM org filter. Refreshed each ETL cycle (truncate-reload OK; no historical +-- state to preserve here — RePORTER is the source of truth). +-- +-- abstract_text is stored here as a cross-reference. The Funding UI reads +-- abstracts from Postgres (Scholars-Profile-System) where they're joined to +-- InfoEd grant rows; this column exists for ad-hoc analysis and future +-- reciterdb-side consumers. +-- +-- project_terms / pref_terms hold the NIH-curated keyword vocabulary RePORTER +-- returns per project, stored raw (project_terms angle-bracket-wrapped, +-- pref_terms semicolon-delimited). Added by alter_add_reporter_terms_v1.3.sql; +-- mirrored into the CREATE TABLE here so a fresh build matches (issue #291). 
+-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `grant_reporter_project` ( + `appl_id` int(11) NOT NULL, + `core_project_num` varchar(32) DEFAULT NULL, + `project_title` varchar(512) DEFAULT NULL, + `org_name` varchar(255) DEFAULT NULL, + `fiscal_year` smallint(6) DEFAULT NULL, + `activity_code` varchar(8) DEFAULT NULL, + `project_start_date` date DEFAULT NULL, + `project_end_date` date DEFAULT NULL, + `abstract_text` mediumtext DEFAULT NULL, + `project_terms` text DEFAULT NULL, + `pref_terms` text DEFAULT NULL, + `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`appl_id`), + KEY `core_project_num` (`core_project_num`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- ----------------------------------------------------------------------------- +-- grant_reporter_link — RePORTER pub-grant linkages +-- ----------------------------------------------------------------------------- +-- One row per (pmid, appl_id) pair returned by /publications/search. +-- Refreshed each ETL cycle (truncate-reload). The grant_provenance table +-- below is what carries history. 
+-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `grant_reporter_link` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `pmid` int(11) NOT NULL, + `appl_id` int(11) NOT NULL, + `core_project_num` varchar(32) DEFAULT NULL, + `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`), + UNIQUE KEY `uk_pmid_appl_id` (`pmid`, `appl_id`), + KEY `pmid` (`pmid`), + KEY `core_project_num` (`core_project_num`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- ----------------------------------------------------------------------------- +-- grant_provenance — per-(person, pmid, grant) source and timestamp log +-- ----------------------------------------------------------------------------- +-- The audit log that survives nightly truncate-reload of person_article_grant. +-- Keyed by (personIdentifier, pmid, core_project_num) where core_project_num +-- is the normalized NIH grant identifier (e.g. "R01DK127777" — no year suffix, +-- no spaces). For non-NIH grants the original articleGrant string is stored +-- in core_project_num as a fallback so the row is still keyable. +-- +-- Update logic (run nightly by retrieveReporter.py after person_article_grant +-- has been refreshed by retrieveArticles.py): +-- +-- 1. UPSERT from person_article_grant: any (personIdentifier, pmid, +-- normalized_grant) currently in person_article_grant gets +-- source_reciterdb=1 and last_verified=NOW(). reciterdb_first_seen is +-- set on first insert and never overwritten. +-- +-- 2. UPSERT from grant_reporter_link joined to person_article (where +-- userAssertion='ACCEPTED' to scope to confirmed WCM authors): any +-- (personIdentifier, pmid, core_project_num) seen in RePORTER gets +-- source_reporter=1 and last_verified=NOW(). reporter_first_seen is +-- set on first insert and never overwritten. 
+-- +-- Subaward caution: see retrieveReporter.py — we filter RePORTER projects +-- to org_names=["WEILL MEDICAL COLL OF CORNELL UNIV"] and join PMIDs to +-- person_article ACCEPTED rows. This minimizes false positives at the cost +-- of missing some legitimate WCM-as-subaward linkages. +-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `grant_provenance` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `personIdentifier` varchar(128) NOT NULL, + `pmid` int(11) NOT NULL, + `core_project_num` varchar(64) NOT NULL, + `appl_id` int(11) DEFAULT NULL, + `source_reporter` tinyint(1) NOT NULL DEFAULT 0, + `source_reciterdb` tinyint(1) NOT NULL DEFAULT 0, + `reporter_first_seen` datetime DEFAULT NULL, + `reciterdb_first_seen` datetime DEFAULT NULL, + `last_verified` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`), + UNIQUE KEY `uk_person_pmid_grant` (`personIdentifier`, `pmid`, `core_project_num`), + KEY `pmid` (`pmid`), + KEY `appl_id` (`appl_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, table_rows, create_time +FROM information_schema.tables +WHERE table_schema = DATABASE() + AND table_name IN ('grant_reporter_project', 'grant_reporter_link', 'grant_provenance') +ORDER BY table_name; diff --git a/setup/alter_add_reporter_terms_v1.3.sql b/setup/alter_add_reporter_terms_v1.3.sql new file mode 100644 index 0000000..0b4d634 --- /dev/null +++ b/setup/alter_add_reporter_terms_v1.3.sql @@ -0,0 +1,59 @@ +-- ============================================================================= +-- Migration: NIH RePORTER project terms (v1.3) +-- ============================================================================= +-- Adds two columns to grant_reporter_project for the NIH-curated 
keyword +-- vocabulary RePORTER returns alongside the abstract: +-- - project_terms — RePORTER `terms`, angle-bracket-wrapped (<a><b><c>) +-- - pref_terms — RePORTER `pref_terms`, semicolon-delimited (a;b;c) +-- +-- Stored raw, verbatim from the API. Parsing into a keyword array happens +-- downstream in the Scholars-Profile-System ETL (issue #291); reciterdb keeps +-- the unparsed strings so a future reciterdb-side consumer can re-parse. +-- +-- WHY AN ALTER, NOT THE CREATE TABLE in v1.2: +-- alter_add_reporter_fields_v1.2.sql creates grant_reporter_project with +-- CREATE TABLE IF NOT EXISTS — a no-op once the table exists, so editing its +-- body would not add columns to a live table. This file uses the +-- information_schema-guarded ALTER idiom (cf. v1.1) so it is safe on a +-- populated prod/dev table. The two columns were also added to v1.2's +-- CREATE TABLE so a fresh build matches. +-- +-- Safe to run on prod and dev. Idempotent (information_schema guard; no-op on +-- re-run). No AFTER clause — keeps ALGORITHM=INSTANT eligible. +-- +-- Run BEFORE deploying the updated retrieveReporter.py, otherwise the project +-- INSERT will fail with "Unknown column" on the 2 new fields. 
+-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- grant_reporter_project: + project_terms + pref_terms +-- ----------------------------------------------------------------------------- + +SET @db = DATABASE(); + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'grant_reporter_project' + AND column_name = 'project_terms') = 0, + 'ALTER TABLE grant_reporter_project ADD COLUMN `project_terms` text DEFAULT NULL', + 'SELECT ''grant_reporter_project.project_terms already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'grant_reporter_project' + AND column_name = 'pref_terms') = 0, + 'ALTER TABLE grant_reporter_project ADD COLUMN `pref_terms` text DEFAULT NULL', + 'SELECT ''grant_reporter_project.pref_terms already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = DATABASE() + AND table_name = 'grant_reporter_project' + AND column_name IN ('project_terms', 'pref_terms') +ORDER BY ordinal_position; diff --git a/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql new file mode 100644 index 0000000..abff544 --- /dev/null +++ b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql @@ -0,0 +1,88 @@ +-- ============================================================================= +-- Migration: UNIQUE KEY on reporting_abstracts.pmid (v1.4) +-- 
============================================================================= +-- Replaces the existing non-unique `idx_pmid` index on reporting_abstracts +-- with a UNIQUE KEY so the parser-desync class of failure that corrupted +-- ~3,100 rows historically (issue #87, pre-PR #78 CSV / LOAD DATA path) can +-- no longer silently produce duplicate-pmid rows. +-- +-- WHY THIS IS NEEDED: +-- update/abstractImport.py's fetch_missing_pmids() uses +-- LEFT JOIN reporting_abstracts a ON a.pmid = p.pmid WHERE a.pmid IS NULL +-- so the import path *assumes* one-row-per-pmid. The schema never +-- enforced it. This migration codifies the assumption, mirroring the +-- analysis_nih fix from March (PR #71/#72 after the Dec 2025 duplicate +-- loading incident). +-- +-- PRECONDITION: +-- reporting_abstracts must contain zero duplicate pmids. The +-- duplicate-count precondition block at the top aborts the migration with a +-- readable error if duplicates remain (run update/repairAbstracts.py +-- first; it warns when duplicates are present). +-- +-- Safe to run on prod and dev. Idempotent (information_schema guard; +-- re-runs are no-ops once the UNIQUE KEY exists). No AFTER clause; the +-- ALTER converts the existing BTREE index in place. +-- ============================================================================= + +SET @db = DATABASE(); + +-- ----------------------------------------------------------------------------- +-- Precondition: no duplicate pmids. +-- +-- If duplicates exist, the precondition synthesizes a SELECT against a +-- non-existent table whose name encodes the duplicate count. The resulting +-- "Table doesn't exist" error halts execution (SELECT-with-1/0 only emits +-- a warning, which MariaDB ignored outside a stored program in v1 of this +-- migration -- the cleanup was attempted, ALTER ran anyway, ALTER failed +-- on the first duplicate pmid). 
+-- ----------------------------------------------------------------------------- + +SET @dup_count = ( + SELECT COUNT(*) FROM ( + SELECT pmid FROM reporting_abstracts + GROUP BY pmid HAVING COUNT(*) > 1 + ) d +); + +SET @sql = IF( + @dup_count > 0, + CONCAT( + 'SELECT 1 FROM `__migration_aborted_reporting_abstracts_has_', + @dup_count, + '_duplicate_pmids__run_update_repairAbstracts_py_first`' + ), + 'SELECT ''No duplicate pmids; precondition satisfied.'' AS status' +); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- reporting_abstracts.idx_pmid: KEY -> UNIQUE KEY +-- ----------------------------------------------------------------------------- + +SET @already_unique = ( + SELECT COUNT(*) FROM information_schema.statistics + WHERE table_schema = @db + AND table_name = 'reporting_abstracts' + AND index_name = 'idx_pmid' + AND non_unique = 0 +); + +SET @sql = IF( + @already_unique > 0, + 'SELECT ''reporting_abstracts.idx_pmid is already UNIQUE; no-op.''', + 'ALTER TABLE reporting_abstracts + DROP INDEX idx_pmid, + ADD UNIQUE KEY idx_pmid (pmid) USING BTREE' +); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, index_name, non_unique, column_name, index_type +FROM information_schema.statistics +WHERE table_schema = DATABASE() + AND table_name = 'reporting_abstracts' + AND index_name = 'idx_pmid'; diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql index e94efd2..ea3bd8e 100644 --- a/setup/createDatabaseTableReciterDb.sql +++ b/setup/createDatabaseTableReciterDb.sql @@ -48,6 +48,7 @@ CREATE TABLE IF NOT EXISTS `admin_feedback_log` ( `personIdentifier` varchar(20) DEFAULT NULL, `articleIdentifier` int(11) 
DEFAULT NULL, `feedback` varchar(11) DEFAULT NULL, + `impersonatedByUserID` int(11) DEFAULT NULL, `createTimestamp` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', `modifyTimestamp` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), PRIMARY KEY (`feedbackID`), @@ -96,6 +97,49 @@ CREATE TABLE IF NOT EXISTS `admin_roles` ( PRIMARY KEY (`roleID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +-- Data-driven RBAC permission tables. +-- Mirror of ReCiter-Publication-Manager scripts/migrations/add-permission-tables.sql +-- (3-places rule). Seed data lives in setup/table_admin_permissions.sql. +CREATE TABLE IF NOT EXISTS `admin_permissions` ( + `permissionID` int(11) NOT NULL AUTO_INCREMENT, + `permissionKey` varchar(128) NOT NULL, + `label` varchar(255) NOT NULL, + `description` text DEFAULT NULL, + `category` varchar(64) NOT NULL, + `createTimestamp` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', + `modifyTimestamp` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), + PRIMARY KEY (`permissionID`), + UNIQUE KEY `uq_permissionKey` (`permissionKey`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +CREATE TABLE IF NOT EXISTS `admin_role_permissions` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `roleID` int(11) NOT NULL, + `permissionID` int(11) NOT NULL, + `createTimestamp` timestamp NOT NULL DEFAULT current_timestamp(), + PRIMARY KEY (`id`), + UNIQUE KEY `uq_role_permission` (`roleID`,`permissionID`), + KEY `idx_roleID` (`roleID`), + KEY `idx_permissionID` (`permissionID`), + CONSTRAINT `fk_rp_role` FOREIGN KEY (`roleID`) REFERENCES `admin_roles` (`roleID`) ON DELETE CASCADE, + CONSTRAINT `fk_rp_permission` FOREIGN KEY (`permissionID`) REFERENCES `admin_permissions` (`permissionID`) ON DELETE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +CREATE TABLE IF NOT EXISTS `admin_permission_resources` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + 
`permissionID` int(11) NOT NULL, + `resourceType` varchar(32) NOT NULL, + `resourceKey` varchar(128) NOT NULL, + `displayOrder` int(11) NOT NULL DEFAULT 0, + `icon` varchar(64) DEFAULT NULL, + `label` varchar(255) NOT NULL, + `route` varchar(255) DEFAULT NULL, + `createTimestamp` timestamp NOT NULL DEFAULT current_timestamp(), + PRIMARY KEY (`id`), + KEY `idx_pr_permissionID` (`permissionID`), + CONSTRAINT `fk_pr_permission` FOREIGN KEY (`permissionID`) REFERENCES `admin_permissions` (`permissionID`) ON DELETE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + CREATE TABLE IF NOT EXISTS `admin_settings` ( `viewName` varchar(200) NOT NULL, `viewAttributes` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_bin DEFAULT NULL CHECK (json_valid(`viewAttributes`)), @@ -260,7 +304,7 @@ CREATE TABLE IF NOT EXISTS `analysis_nih` ( `x_coord` float(5,4) DEFAULT NULL, `y_coord` float(5,4) DEFAULT NULL, PRIMARY KEY (`id`), - KEY `idx_pmid` (`pmid`) USING BTREE + UNIQUE KEY `idx_pmid` (`pmid`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS `analysis_nih_cites` ( @@ -311,6 +355,7 @@ CREATE TABLE IF NOT EXISTS `analysis_summary_article` ( `publicationDateDisplay` varchar(200) DEFAULT NULL, `publicationDateStandardized` varchar(128) DEFAULT NULL, `datePublicationAddedToEntrez` varchar(128) DEFAULT NULL, + `datePublicationAddedToPMC` varchar(128) DEFAULT NULL, `articleTitle` varchar(1000) DEFAULT NULL, `articleTitleRTF` varchar(2000) DEFAULT NULL, `publicationTypeCanonical` varchar(128) DEFAULT NULL, @@ -654,6 +699,7 @@ CREATE TABLE IF NOT EXISTS `person_article` ( `scopusNonTargetAuthorInstitutionalAffiliationSource` varchar(128) DEFAULT NULL, `scopusNonTargetAuthorInstitutionalAffiliationScore` float DEFAULT 0, `datePublicationAddedToEntrez` varchar(128) DEFAULT NULL, + `datePublicationAddedToPMC` varchar(128) DEFAULT NULL, `doi` varchar(128) DEFAULT NULL, `issn` varchar(128) DEFAULT NULL, `issue` 
varchar(500) DEFAULT 'NULL', @@ -673,6 +719,9 @@ CREATE TABLE IF NOT EXISTS `person_article` ( `feedbackScoreOrganization` float DEFAULT NULL, `feedbackScoreTargetAuthorName` float DEFAULT NULL, `feedbackScoreYear` float DEFAULT NULL, + `feedbackScoreTextSimilarity` float DEFAULT NULL, + `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL, + `feedbackScoreBibliographicCoupling` float DEFAULT NULL, `totalArticleScoreStandardized` int(11) DEFAULT NULL, `totalArticleScoreNonStandardized` float DEFAULT NULL, `targetAuthorCount` int(11) DEFAULT NULL, @@ -796,7 +845,7 @@ CREATE TABLE IF NOT EXISTS `reporting_abstracts` ( `abstract` blob DEFAULT NULL, `abstractVarchar` varchar(15000) DEFAULT NULL, PRIMARY KEY (`id`), - KEY `idx_pmid` (`pmid`) USING BTREE + UNIQUE KEY `idx_pmid` (`pmid`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS `reporting_ad_hoc_feature_generator_execution` ( diff --git a/setup/createEventsProceduresReciterDb.sql b/setup/createEventsProceduresReciterDb.sql index 7891ea1..c8a05f5 100644 --- a/setup/createEventsProceduresReciterDb.sql +++ b/setup/createEventsProceduresReciterDb.sql @@ -2455,10 +2455,10 @@ order by pmid desc, rank asc; #### 3. 
Populate "analysis_summary_article" table with articles #### -insert into analysis_summary_article (pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus) -select distinct -pmid, max(pmcid), publicationTypeCanonical, articleYear, min(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, max(timesCited) -from person_article +insert into analysis_summary_article (pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, datePublicationAddedToEntrez, datePublicationAddedToPMC, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus) +select distinct +pmid, max(pmcid), publicationTypeCanonical, articleYear, min(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, max(datePublicationAddedToPMC), articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, max(timesCited) +from person_article where userAssertion = 'ACCEPTED' group by pmid order by datePublicationAddedToEntrez desc; @@ -3584,17 +3584,19 @@ proc_main: BEGIN INSERT INTO analysis_summary_article_new ( pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, - datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, + datePublicationAddedToEntrez, datePublicationAddedToPMC, + articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus ) SELECT DISTINCT pmid, MAX(pmcid), publicationTypeCanonical, - articleYear, + IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4)), MIN(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, + MAX(datePublicationAddedToPMC), articleTitle, journalTitleVerbose, issn, diff 
--git a/setup/insertBaselineDataReciterDb.sql b/setup/insertBaselineDataReciterDb.sql deleted file mode 100644 index 081b1c3..0000000 --- a/setup/insertBaselineDataReciterDb.sql +++ /dev/null @@ -1,353 +0,0 @@ -SET FOREIGN_KEY_CHECKS = 0; -TRUNCATE `admin_roles`; - -LOCK TABLES `admin_roles` WRITE; -INSERT INTO `admin_roles` (`roleID`, `roleLabel`) VALUES - (1,'Superuser'), - (2,'Curator_All'), - (3,'Reporter_All'), - (4,'Curator_Self'), - (5,'Curator_Department'), - (6,'Curator_Department_Delegate'); -UNLOCK TABLES; - -TRUNCATE `analysis_special_characters`; - -LOCK TABLES `analysis_special_characters` WRITE; -TRUNCATE `analysis_special_characters`; -INSERT INTO `analysis_special_characters` (`id`, `specialCharacter`, `RTFescape`, `characterName`) VALUES - (1,'͵','\\\'82','Low left single quote'), - (2,'ƒ','\\\'83','Florin'), - (3,'„','\\\'84','Low left double quote'), - (4,'…','\\\'85','Ellipsis'), - (5,'†','\\\'86','Dagger'), - (6,'‡','\\\'87','Double dagger'), - (7,'∘','\\\'88','Circumflex'), - (8,'‰','\\\'89','Permil'), - (9,'Š','\\\'8a','S-caron'), - (10,'‹','\\\'8b','Single left guillemet'), - (11,'Œ','\\\'8c','OE-ligature'), - (12,'Ž','\\\'8e','Z-caron'), - (13,'‘','\\\'91','Left single quote'), - (14,'’','\\\'92','Right single quote'), - (15,'“','\\\'93','Left double quote'), - (16,'”','\\\'94','Right double quote'), - (17,'•','\\\'95','Bullet'), - (18,'–','\\\'96','En dash'), - (19,'—','\\\'97','Em dash'), - (20,'~','\\\'98','Tilde'), - (21,'™','\\\'99','Trademark'), - (22,'š','\\\'9a','s-caron'), - (23,'›','\\\'9b','Single right guillemet'), - (24,'œ','\\\'9c','oe ligature'), - (25,'ž','\\\'9e','z-caron'), - (26,'Ÿ','\\\'9f','Y-diaeresis'), - (27,'¡','\\\'a1','Inverted exclamation point'), - (28,'¢','\\\'a2','Cent sign'), - (29,'£','\\\'a3','Pound sign'), - (30,'¤','\\\'a4','General currency sign'), - (31,'¥','\\\'a5','Yen sign'), - (32,'¦','\\\'a6','Broken vertical bar'), - (33,'§','\\\'a7','Section sign'), - (34,'¨','\\\'a8','Spacing diaeresis'), - 
(35,'©','\\\'a9','Copyright'), - (36,'ª','\\\'aa','Feminine ordinal'), - (37,'«','\\\'ab','Left angle quotes'), - (38,'¬','\\\'ac','Not sign'), - (39,'(-)','\\-','Soft hyphen'), - (40,'®','\\\'ae','Registered trademark'), - (41,'¯','\\\'af','Macron accent'), - (42,'°','\\\'b0','Degree sign'), - (43,'±','\\\'b1','Plus or minus sign'), - (44,'²','\\\'b2','Superscript 2'), - (45,'³','\\\'b3','Superscript 3'), - (46,'´','\\\'b4','Acute accent'), - (47,'µ','\\\'b5','Micro sign (Greek mu)'), - (48,'¶','\\\'b6','Paragraph sign'), - (49,'·','\\\'b7','Middle dot'), - (50,'¸','\\\'b8','Cedilla'), - (51,'¹','\\\'b9','Superscript 1'), - (52,'º','\\\'ba','Masculine ordinal'), - (53,'»','\\\'bb','Right angle quotes'), - (54,'¼','\\\'bc','One-fourth fraction'), - (55,'½','\\\'bd','One-half fraction'), - (56,'¾','\\\'be','Three-fourths fraction'), - (57,'¿','\\\'bf','Inverted question mark'), - (58,'À','\\\'c0','A-grave'), - (59,'Á','\\\'c1','A-acute'), - (60,'Â','\\\'c2','A-circumflex'), - (61,'Ã','\\\'c3','A-tilde'), - (62,'Ä','\\\'c4','A-diaeresis'), - (63,'Å','\\\'c5','A-ring'), - (64,'Æ','\\\'c6','AE-ligature'), - (65,'Ç','\\\'c7','C-cedilla'), - (66,'È','\\\'c8','E-grave'), - (67,'É','\\\'c9','E-acute'), - (68,'Ê','\\\'ca','E-circumflex'), - (69,'Ë','\\\'cb','E-diaeresis'), - (70,'Ì','\\\'cc','I-grave'), - (71,'Í','\\\'cd','I-acute'), - (72,'Î','\\\'ce','I-circumflex'), - (73,'Ï','\\\'cf','I-diaeresis'), - (74,'Ð','\\\'d0','Uppercase edh'), - (75,'Ñ','\\\'d1','N-tilde'), - (76,'Ò','\\\'d2','O-grave'), - (77,'Ó','\\\'d3','O-acute'), - (78,'Ô','\\\'d4','O-circumflex'), - (79,'Õ','\\\'d5','O-tilde'), - (80,'Ö','\\\'d6','O-diaeresis'), - (81,'×','\\\'d7','Multiply sign'), - (82,'Ø','\\\'d8','O-slash'), - (83,'Ù','\\\'d9','U-grave'), - (84,'Ú','\\\'da','U-acute'), - (85,'Û','\\\'db','U-circumflex'), - (86,'Ü','\\\'dc','U-diaeresis'), - (87,'Ý','\\\'dd','Y-acute'), - (88,'Þ','\\\'de','Uppercase thorn'), - (89,'ß','\\\'df','German ess-zed'), - (90,'à','\\\'e0','a-grave'), - 
(91,'á','\\\'e1','a-acute'), - (92,'â','\\\'e2','a-circumflex'), - (93,'ã','\\\'e3','a-tilde'), - (94,'ä','\\\'e4','a-diaeresis'), - (95,'å','\\\'e5','a-ring'), - (96,'æ','\\\'e6','ae-ligature'), - (97,'ç','\\\'e7','c-cedilla'), - (98,'è','\\\'e8','e-grave'), - (99,'é','\\\'e9','e-acute'), - (100,'ê','\\\'ea','e-circumflex'), - (101,'ë','\\\'eb','e-diaeresis'), - (102,'ì','\\\'ec','i-grave'), - (103,'í','\\\'ed','i-acute'), - (104,'î','\\\'ee','i-circumflex'), - (105,'ï','\\\'ef','i-diaeresis'), - (106,'ð','\\\'f0','Lowercase edh'), - (107,'ñ','\\\'f1','n-tilde'), - (108,'ò','\\\'f2','o-grave'), - (109,'ó','\\\'f3','o-acute'), - (110,'ô','\\\'f4','o-circumflex'), - (111,'õ','\\\'f5','o-tilde'), - (112,'ö','\\\'f6','o-diaeresis'), - (113,'÷','\\\'f7','Division sign'), - (114,'ø','\\\'f8','o-slash'), - (115,'ù','\\\'f9','u-grave'), - (116,'ú','\\\'fa','u-acute'), - (117,'û','\\\'fb','u-circumflex'), - (118,'ü','\\\'fc','u-diaeresis'), - (119,'ý','\\\'fd','y-acute'), - (120,'þ','\\\'fe','Lowercase thorn'), - (121,'ÿ','\\\'ff','y-diaeresis'), - (122,'č','\\u269 ',NULL), - (123,'ć','\\u263 ',NULL), - (124,'β','\\u946 ','beta'), - (125,'Α','\\u913 ','Alpha'), - (126,'Β','\\u914 ','Beta'), - (127,'Γ','\\u915 ','Gamma'), - (128,'Δ','\\u916 ','Delta'), - (129,'Ε','\\u917 ','Epsilon'), - (130,'Ζ','\\u918 ','Zeta'), - (131,'Η','\\u919 ','Eta'), - (132,'Θ','\\u920 ','Theta'), - (133,'Ι','\\u921 ','Iota'), - (134,'Κ','\\u922 ','Kappa'), - (135,'Λ','\\u923 ','Lambda'), - (136,'Μ','\\u924 ','Mu'), - (137,'Ν','\\u925 ','Nu'), - (138,'Ξ','\\u926 ','Xi'), - (139,'Ο','\\u927 ','Omicron'), - (140,'Π','\\u928 ','Pi'), - (141,'Ρ','\\u929 ','Rho'), - (142,'Σ','\\u931 ','Sigma'), - (143,'Τ','\\u932 ','Tau'), - (144,'Υ','\\u933 ','Upsilon'), - (145,'Φ','\\u934 ','Phi'), - (146,'Χ','\\u935 ','Chi'), - (147,'Ψ','\\u936 ','Psi'), - (148,'Ω','\\u937 ','Omega'), - (149,'α','\\u945 ','Alpha'), - (150,'β','\\u946 ','Beta'), - (151,'γ','\\u947 ','Gamma'), - (152,'δ','\\u948 ','Delta'), - 
(153,'ε','\\u949 ','Epsilon'), - (154,'ζ','\\u950 ','Zeta'), - (155,'η','\\u951 ','Eta'), - (156,'θ','\\u952 ','Theta'), - (157,'ι','\\u953 ','Iota'), - (158,'κ','\\u954 ','Kappa'), - (159,'λ','\\u955 ','Lambda'), - (160,'μ','\\u956 ','Mu'), - (161,'ν','\\u957 ','Nu'), - (162,'ξ','\\u958 ','Xi'), - (163,'ο','\\u959 ','Omicron'), - (164,'π','\\u960 ','Pi'), - (165,'ρ','\\u961 ','Rho'), - (166,'σ','\\u963 ','Sigma'), - (167,'ς','\\u962 ','Sigma'), - (168,'τ','\\u964 ','Tau'), - (169,'υ','\\u965 ','Upsilon'), - (170,'φ','\\u966 ','Phi'), - (171,'χ','\\u967 ','Chi'), - (172,'ψ','\\u968 ','Psi'), - (173,'ω','\\u969 ','Omega'), - (174,'®','\\\'ae','reserved'), - (175,'ü','\\\'fc','u umlaut'), - (176,'ö','\\\'f6','o umlaut'), - (177,'é','\\\'e9','accented e'), - (178,'ç','\\\'e7','french c'), - (179,'…','\\\'85','ellipsis'), - (180,'ó','\\\'f3','accented o'), - (181,'™','\\\'99','trademark'), - (182,'≤','\\u8804 ','less than or equal to'), - (183,'≥','\\u8805 ','greater than or equal to'), - (184,'à','\\\'e0','accented a'), - (185,'ï','\\\'ef','i umlaut'), - (186,'—','\\\'97','long dash'), - (187,'→','\\u8594 ','right arrow'), - (188,'←','\\u8592 ','left arrow'), - (189,'°','\\\'b0','degree'), - (190,'á','\\\'e1','accented a'), - (191,'†','\\\'86','cross'), - (192,'è','\\\'e8','accented e'), - (193,'ê','\\\'ea','weird e'), - (194,'ã','\\\'e3','a with tilde'), - (195,'ß','\\\'df','beta'), - (196,'ū','\\u363 ','u with tilde'), - (197,'‡','\\\'87','double dagger'), - (198,'©','\\\'a9','copyright'), - (199,'∆','\\u8710 ','delta'), - (200,'í','\\\'ed','accented i'), - (201,'’','\\\'92','apostrophe'), - (202,'ë','\\\'eb','e with umlaut'), - (203,'ñ','\\\'f1','n with tilde'), - (204,'±','\\\'b1','plus or minus'), - (205,'”','\\\'94','double quotes'), - (206,'×','\\\'d7','x'), - (207,'Å','\\\'c5','a with circle'), - (208,'↔','\\u8596 ','double arrow'), - (209,'ä','\\u228 ','a with umlaut'), - (210,'“','\\\'81\\\'67','double quotes'), - (211,'•','\\u8226 ','bullet'), - 
(212,'∗','\\u8727 ','star'), - (213,'{','\\{','left brace'), - (214,'}','\\}','right brace'), - (215,'¹','\\\'b9','superscript 1'), - (216,'²','\\\'b2','superscript 2'), - (217,'³','\\\'b3','superscript 3'), - (218,'⁴','\\u8308 ','superscript 4'), - (219,'⁵','\\u8309 ','superscript 5'), - (220,'⁶','\\u8310 ','superscript 6'), - (221,'⁷','\\u8311 ','superscript 7'), - (222,'⁸','\\u8312 ','superscript 8'), - (223,'⁹','\\u8313 ','superscript 9'), - (224,'⁰','\\u8304 ','superscript 0'), - (225,'₁','\\u8321 ','subscript 1'), - (226,'₂','\\u8322 ','subscript 2'), - (227,'₃','\\u8323 ','subscript 3'), - (228,'₄','\\u8324 ','subscript 4'), - (229,'₅','\\u8325 ','subscript 5'), - (230,'₆','\\u8326 ','subscript 6'), - (231,'₇','\\u8327 ','subscript 7'), - (232,'₈','\\u8328 ','subscript 8'), - (233,'₉','\\u8329 ','subscript 9'), - (234,'₀','\\u8320 ','subscript 0'), - (235,'~','\\u8764 ','tilde'), - (236,'⁺','\\u8314 ','superscript plus'), - (237,'✰','\\u10032 ','star'), - (238,'·','\\\'b7','dot'), - (239,'–','\\\'96','dash'), - (240,'∩','\\u8745 ','intersection'), - (241,'‑','\\u8209 ','dash'), - (242,'☆','\\u9734 ','star'), - (243,'ɛ','\\u603 ','backwards 3'), - (244,'ô','\\\'f4','o with hat'), - (245,'fi','\\u64257 ','fi or something'), - (246,'ĸ','\\u312 ','k or something'), - (247,'ń','\\u324 ','accented n'), - (248,'″','\\u8243 ','quote'), - (249,'⁻','\\u8315 ','dash or something'), - (250,'‒','\\u8210 ','dash or something'), - (251,'ů','\\u367 ','u circle'), - (252,'√','\\u8730 ','checkmark'), - (253,'‘','\\\'91','apostrophe'), - (254,'ø','\\\'f8','o with slash'), - (255,'ú','\\\'fa','accented u'), - (256,'č','\\u269 ','c with caret'), - (257,'ć','\\u263 ','c with accent'), - (258,'ğ','\\u287 ','g with caret'), - (259,'ā','\\u257 ','a with line on top'), - (260,'õ','\\\'f5','o with tilde'), - (261,'ś','\\u347 ','accented s'), - (262,'î','\\\'ee','i with caret on top'), - (263,'ş','\\u351 ','s with squiggle'), - (264,'Ş','\\u350 ','capital s with squiggle'), - 
(265,'ʼ','\\u700 ','apostrophe'), - (266,'â','\\\'e2','a with caret'), - (267,'ı','\\u305 ','little i'), - (268,'ė','\\u279 ','e with dot'), - (269,'ł','\\u322 ','I with slash'), - (270,'ą','\\u261 ','a with squiggle'), - (271,'ę','\\u281 ','french e'), - (272,'ĭ','\\u301 ','i with half circle'), - (273,'ň','\\u328 ','n with caret'), - (274,'İ','\\u304 ','i with dot on top'), - (275,'ě','\\u283 ','e with caret'), - (276,'ǧ','\\u487 ','g with caret'), - (277,'ő','\\u337 ','o with two accents'), - (278,'û','\\\'fb','u with caret'), - (279,'ý','\\\'fd','y with accent'), - (280,'ź','\\u378 ','z with accent'), - (281,'ż','\\u380 ','z with dot'), - (282,'ű','\\u369 ','u with two accents'), - (283,'ŏ','\\u335 ','o with half circle'), - (284,'ī','\\u299 ','i with line on top'), - (285,'ӧ','\\u1255 ','o with umlaut'), - (286,'Đ','\\u272 ','d with slash'), - (287,'ř','\\u345 ','r with caret'), - (288,'ˇ','\\u711 ','caret'), - (289,'ă','\\u259 ','a with caret'), - (290,'ŕ','\\u341 ','r with accent'), - (291,'ĕ','\\u277 ','e with caret'), - (292,'ό','\\u972 ','o with accent'), - (293,'ũ','\\u361 ','u with tilde'), - (294,'׳','\\\'d7','apostrophe'); -UNLOCK TABLES; - -LOCK TABLES `admin_settings` WRITE; -TRUNCATE `admin_settings`; -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('displayMessages', '[{"labelUserKey":"messages","helpTextSettingsView":"Controls the displying of the success or error messages","isVisible":true}]', 'Display Messages'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('EmailNotifications', '[{"labelUserKey":"emailNotifications","labelSettingsView":"Enable email notifications","helpTextSettingsView":"Check to enable for all users. 
Once enabled, users with the \\"curator_self role\\" and a registered email will have the option to \\"Manage notifications.\\"","isVisible":true},{"labelUserKey":"emailSender","labelSettingsView":"Email sender","helpTextSettingsView":"Specify the \\"from\\" email address for the notifications. You can use either of these formats \\"publications@med.cornell.edu\\" or \\"WCM Publications \\"","labelUserView":"publications@med.cornell.edu"},{"labelUserKey":"emailSalutation","labelSettingsView":"Email body: salutation","labelUserView":"","helpTextSettingsView":"This text is the greeting portion of the email notification."},{"labelUserKey":"acceptedSubjectHeader","labelSettingsView":"Email body: \\"Accepted\\" section prefix","helpTextSettingsView":"This text precedes the list of any publications that have been accepted on behalf of a given person.","labelUserView":"The following publications have been accepted on your behalf"},{"labelUserKey":"suggestedSubjectHeader","labelSettingsView":"Email body: \\"Suggested\\" section prefix","helpTextSettingsView":"This text precedes the list of any publications that have been suggested for a given person.","labelUserView":"The following publications are pending for you"},{"labelUserKey":"acceptedEmailNotificationsLimit","labelSettingsView":"Email body: max accepted articles","helpTextSettingsView":"Select the maximum number of accepted publications to display in an email. We recommend a limit of 10. Note that this section excludes publications that have been accepted by the user themselves.","maxLimit":"5"},{"labelUserKey":"suggestedEmailNotificationsLimit","labelSettingsView":"Email body: max suggested articles","helpTextSettingsView":"Select the maximum number of suggested publications to display in an email. We recommend a limit of 10.","maxLimit":"10"},{"labelUserKey":"emailSignature","labelUserView":"Sincerely,\\nSamuel J. 
Wood Library\\nWeill Cornell Medicine\\n","labelSettingsView":"Email signature","helpTextSettingsView":"Define the signature that will appear at the end of the email."},{"labelUserKey":"testemailfunctionality","labelSettingsView":"Test emailing functionality","helpTextSettingsView":"Here you can test the email user functionality by inputting a person identifier, an email address recipient, and then clicking on \\"Send test email\\". If the Email Override field is blank, users will be contacted at their email of record as stored in the admin_users table.If the Email Override checkbox is selected, all notification emails from any regularly scheduled job will be sent to the email address specified in this field.","personIdentifier":"acs2001","emailOverride":"reciter2024@med.cornell.edu","useEmailForScheduledJobs":true,"submitButton":""}]', 'Email Notifications'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('findPeople', '[{"labelUserKey":"personIdentifier","labelUserView":"CWID","labelSettingsView":"Label for person identifier","helpTextSettingsView":"e.g.,NetID, CWID. Used throughout the application. 
"},{"labelUserKey":"organization","labelUserView":"Organization(s)","labelSettingsView":"Label for organizational unit","helpTextSettingsView":"e.g., Pediatrics"},{"labelUserKey":"institution","labelUserView":"Institution(s)","labelSettingsView":"Label for institution","helpTextSettingsView":"e.g., Cornell University"},{"labelUserKey":"personType","labelUserView":"Person Type(s)","labelSettingsView":"Label for person type","helpTextSettingsView":"e.g., academic-faculty,student-phd"}]', 'Find People'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('headshot', '[{"labelUserKey":"headshot","labelUserView":"Headshot","labelSettingsView":"Headshot","helpTextSettingsView":"Include the full URL for a third party headshot API where a token a personIdentifier is enclosed by braces","syntax":"https://directory.weill.cornell.edu/api/v1/person/profile/{personIdentifier}.png?returnGenericOn404=false"}]', 'Headshot'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingArticleCSV', '[{"labelUserKey":"reportingArticleCSVLimit","labelSettingsView":"Maximum records for export to article CSV","helpTextSettingsView":"","maxLimit":"500000"},{"labelUserKey":"pmid","labelUserView":"PMID","labelSettingsView":"PMID","helpTextSettingsView":"","displayRank":"21","isVisible":true},{"labelUserKey":"Article title","labelUserView":"Article title","labelSettingsView":"Article title","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"Article year","labelUserView":"Article year","labelSettingsView":"Article year","helpTextSettingsView":"","displayRank":"3","isVisible":true},{"labelUserKey":"pmcid","labelUserView":"PMCID","labelSettingsView":"PMCID","helpTextSettingsView":"","displayRank":4,"isVisible":true},{"labelUserKey":"Publication date display","labelUserView":"Publication date display","labelSettingsView":"Publication date 
display","helpTextSettingsView":"","displayRank":5,"isVisible":true},{"labelUserKey":"Date standardized","labelUserView":"Publication date standardized","labelSettingsView":"Publication date standardized","helpTextSettingsView":"","displayRank":6,"isVisible":true},{"labelUserKey":"Date added","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":7,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal Title Verbose","labelSettingsView":"Journal Title Verbose","helpTextSettingsView":"","displayRank":8,"isVisible":true},{"labelUserKey":"doi","labelUserView":"DOI","labelSettingsView":"DOI","helpTextSettingsView":"","displayRank":9,"isVisible":true},{"labelUserKey":"Issue","labelUserView":"Issue","labelSettingsView":"Issue","helpTextSettingsView":"","displayRank":10,"isVisible":true},{"labelUserKey":"Pages","labelUserView":"Pages","labelSettingsView":"Pages","helpTextSettingsView":"","displayRank":11,"isVisible":true},{"labelUserKey":"Volume","labelUserView":"Volume","labelSettingsView":"Volume","helpTextSettingsView":"","displayRank":12,"isVisible":true},{"labelUserKey":"Scimago Journal Rank","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","displayRank":13,"isVisible":true},{"labelUserKey":"Mendeley readers","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","displayRank":14,"isVisible":true},{"labelUserKey":"NIH Relative Citation Ratio","labelUserView":"NIH Relative Citation Ratio","labelSettingsView":"NIH Relative Citation Ratio","helpTextSettingsView":"","displayRank":15,"isVisible":true},{"labelUserKey":"NIH Percentile Rank","labelUserView":"NIH Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","displayRank":16,"isVisible":true},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs 
score","helpTextSettingsView":"","displayRank":17,"isVisible":false},{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","helpTextSettingsView":"","displayRank":18,"isVisible":true},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextSettingsView":"","displayRank":19,"isVisible":false},{"labelUserKey":"Authors","labelUserView":"Author(s)","labelSettingsView":"Author(s)","helpTextSettingsView":"","displayRank":"1","isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"An additional journal-level metric","displayRank":"20","isVisible":true},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author Position","helpTextSettingsView":"Author Position","displayRank":"21","isVisible":true}]', 'Reporting Article CSV'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingArticleRTF', '[{"labelUserKey":"reportingArticleRTFLimit","labelSettingsView":"Maximum records for export to article RTF","helpTextSettingsView":"Maximum number of article records a user can export to RTF. In testing, we have found the export fails after 40,000 records. 
","maxLimit":"1000","isValidate":false,"errorMessage":"Limit cannot exceed 40000"}]', 'Reporting Article RTF'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingAuthorshipCSV', '[{"labelUserKey":"reportingAuthorshipCSVLimit","labelSettingsView":"Maximum records for export to article CSV","helpTextSettingsView":"","maxLimit":"500000"},{"labelUserKey":"personIdentifier","labelUserView":"PersonIdentifier","labelSettingsView":"CWID","helpTextSettingsView":"","displayRank":"1","isVisible":false},{"labelUserKey":"Last Name","labelUserView":"Last Name","labelSettingsView":"Last Name","helpTextSettingsView":"","displayRank":"2","isVisible":false},{"labelUserKey":"First Name","labelUserView":"First Name","labelSettingsView":"First Name","helpTextSettingsView":"","displayRank":"3","isVisible":false},{"labelUserKey":"Organization","labelUserView":"Organization","labelSettingsView":"Organization","helpTextSettingsView":"","displayRank":"4","isVisible":true},{"labelUserKey":"Institution","labelUserView":"Institution","labelSettingsView":"Institution","helpTextSettingsView":"","displayRank":"5","isVisible":true},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author Position","helpTextSettingsView":"Author Position","displayRank":"6","isVisible":true},{"labelUserKey":"pmid","labelUserView":"PMID","labelSettingsView":"PMID","helpTextSettingsView":"","displayRank":7,"isVisible":true},{"labelUserKey":"Article title","labelUserView":"Article title","labelSettingsView":"Article title","helpTextSettingsView":"","displayRank":8,"isVisible":true},{"labelUserKey":"Article year","labelUserView":"Article year","labelSettingsView":"Article year","helpTextSettingsView":"","displayRank":9,"isVisible":true},{"labelUserKey":"pmcid","labelUserView":"PMCID","labelSettingsView":"PMCID","helpTextSettingsView":"","displayRank":10,"isVisible":true},{"labelUserKey":"Publication date 
display","labelUserView":"Publication date display","labelSettingsView":"Publication date display","helpTextSettingsView":"","displayRank":11,"isVisible":true},{"labelUserKey":"Date standardized","labelUserView":"Date standardized","labelSettingsView":"Publication date standardized","helpTextSettingsView":"","displayRank":12,"isVisible":true},{"labelUserKey":"Date added","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":13,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal title","labelSettingsView":"Journal title","helpTextSettingsView":"","displayRank":14,"isVisible":true},{"labelUserKey":"doi","labelUserView":"DOI","labelSettingsView":"DOI","helpTextSettingsView":"","displayRank":15,"isVisible":true},{"labelUserKey":"Issue","labelUserView":"Issue","labelSettingsView":"Issue","helpTextSettingsView":"","displayRank":16,"isVisible":true},{"labelUserKey":"Pages","labelUserView":"Pages","labelSettingsView":"Pages","helpTextSettingsView":"","displayRank":17,"isVisible":true},{"labelUserKey":"Volume","labelUserView":"Volume","labelSettingsView":"Volume","helpTextSettingsView":"","displayRank":18,"isVisible":true},{"labelUserKey":"Scimago Journal Rank","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","displayRank":19,"isVisible":true},{"labelUserKey":"Mendeley readers","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","displayRank":20,"isVisible":true},{"labelUserKey":"NIH Relative Citation Ratio","labelUserView":"NIH Relative Citation Ratio","labelSettingsView":"NIH Relative Citation Ratio","helpTextSettingsView":"","displayRank":21,"isVisible":true},{"labelUserKey":"NIH Percentile Rank","labelUserView":"NIH Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","displayRank":22,"isVisible":true},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs 
score","labelSettingsView":"TrendingPubs score","helpTextSettingsView":"","displayRank":23,"isVisible":false},{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH) ","helpTextSettingsView":"","displayRank":24,"isVisible":true},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus) ","labelSettingsView":"Citation count (Scopus) ","helpTextSettingsView":"","displayRank":25,"isVisible":false},{"labelUserKey":"Person types","labelUserView":"Person type(s)","labelSettingsView":"Person type(s)","helpTextSettingsView":"","displayRank":26,"isVisible":true},{"labelUserKey":"Authors","labelUserView":"Author(s)","labelSettingsView":"Author(s)","helpTextSettingsView":"","displayRank":27,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal Title Verbose","labelSettingsView":"Journal Title Verbose","helpTextSettingsView":"","displayRank":28,"isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"An additional journal-level metric","displayRank":"20","isVisible":true}]', 'Reporting Authorship CSV'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingFilters', '[{"labelUserKey":"Author","labelUserView":"Author","labelSettingsView":"Author","helpTextSettingsView":""},{"labelUserKey":"Organization","labelUserView":"Organization","labelSettingsView":"Organization","helpTextSettingsView":""},{"labelUserKey":"Institution","labelUserView":"Institution","labelSettingsView":"Institution","helpTextSettingsView":""},{"labelUserKey":"Person Type","labelUserView":"Person Type","labelSettingsView":"Person Type","helpTextSettingsView":""},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author 
Position","helpTextSettingsView":""},{"labelUserKey":"Date","labelUserView":"Date","labelSettingsView":"Date","helpTextSettingsView":""},{"labelUserKey":"Type","labelUserView":"Type","labelSettingsView":"Type","helpTextSettingsView":""},{"labelUserKey":"Journal","labelUserView":"Journal","labelSettingsView":"Journal","helpTextSettingsView":""},{"labelUserKey":"Journal Rank","labelUserView":"Journal Rank","labelSettingsView":"Journal Rank","helpTextSettingsView":""}]', 'Reporting Filters'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingWebDisplay', '[{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","helpTextUserView":"This is the test number of citations an article has received from CrossRef, MEDLINE, PubMed Central, and Entrez. NIH citation counts generally correlate closely with the counts in Scopus and Web of Knowledge. ","helpTextSettingsView":"","displayRank":"3","isVisible":false},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextUserView":"","helpTextSettingsView":"","displayRank":"1","isVisible":true},{"labelUserKey":"Percentile Rank","labelUserView":"Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextUserView":"NIH percentile is the value of RCR provided as a percentile in which 100 is the highest and 0 is the lowest. For example, if an article has an NIH percentile of 63.2, it has received more citations than 631 articles when measured against a field and time-weighted benchmark of 1,000 NIH-funded research articles from the same year. 
A percentile is not computed for an article published in the past two years.","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"Relative Citation Ratio (NIH)","labelUserView":"Relative Citation Ratio (NIH)","labelSettingsView":"Relative Citation Ratio (NIH)","helpTextSettingsView":"","helpTextUserView":"Relative Citation Ratio (RCR) is the ratio between the number of times an article was cited in comparison to publications of the same date and field (as inferred by co-citation networks). A value of 1.0 is the median. Higher is better. The benchmark consists of research articles that are the product of R01 grants, the NIH''''s most prestigious and competitive funding mechanism.","displayRank":4,"isVisible":true},{"labelUserKey":"Journal Rank","labelUserView":"Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextUserView":"SCImago Journal Rank is a measure of the relative number of inbound citations articles in a given journal receive compared to outbound citations. 
It is closely correlated with Journal Impact Factor.","helpTextSettingsView":"Journal Rank Help text","displayRank":"5","isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Impact","labelSettingsView":"Journal Metric","helpTextUserView":"","helpTextSettingsView":"","displayRank":6,"isVisible":true},{"labelUserKey":"readersMendeley","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextUserView":"","helpTextSettingsView":"","displayRank":7,"isVisible":false},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextUserView":"","helpTextSettingsView":"","displayRank":8,"isVisible":false}]', 'Reporting Web Display'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingWebViewSort', '[{"labelUserKey":"datePublicationAddedToEntrez","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"citationCountNIH","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","displayRank":"1","helpTextSettingsView":"","isVisible":true},{"labelUserKey":"citationCountScopus","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextSettingsView":"","displayRank":"3","isVisible":true},{"labelUserKey":"journalImpactScore1","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","isVisible":true,"displayRank":"4"},{"labelUserKey":"journalImpactScore2","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"","displayRank":"5","isVisible":true},{"labelUserKey":"readersMendeley","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","isVisible":false,"displayRank":"6"},{"labelUserKey":"percentileNIH","labelUserView":"NIH Percentile 
Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","isVisible":true,"displayRank":"7"},{"labelUserKey":"relativeCitationRatioNIH","labelUserView":"Relative Citation Ratio (NIH)","labelSettingsView":"Relative Citation Ratio (NIH)","helpTextSettingsView":"","isVisible":true,"displayRank":"8"},{"labelUserKey":"trendingPubsScore","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextSettingsView":"","isVisible":false,"displayRank":"9"}]', 'Reporting Web View Sort'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('userRoles', '[{"labelUserKey":"Roles````","labelSettingsView":"Select the role","inputType":"check","isRoleGroup":false,"roles":[{"roleId":3,"roleName":"Repoter_All","roleLabel":"Automatically grant all successfully authenticated users the reporter_all role.","isChecked":true},{"roleId":2,"roleName":"Curator_All","roleLabel":"Automatically grant all successfully authenticated users the curator_all role","isChecked":false}]}]', 'User Default Role(s)'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('viewProfile', '[{"labelUserKey":"h-index","labelUserView":"h-index","labelSettingsView":"h-index (NIH)","helpTextUserView":"h-index is the number of an author''s articles in PubMed that have been cited, as defined by NIH''s iCite service, at least that many times. ","helpTextSettingsView":""},{"labelUserKey":"h5-index","labelUserView":"h5-index","labelSettingsView":"h5-index (NIH)","helpTextUserView":"h5-index is the number of an author''s articles in PubMed that have been cited, as defined by NIH''s iCite service, at least that many times within the past 5 years. 
","helpTextSettingsView":""},{"labelUserKey":"hindexScopus","labelUserView":"h-index (Scopus)","labelSettingsView":"h-index (Scopus)","helpTextUserView":"h-index is the number of an author''s articles in Scopus that have been cited, as defined by Scopus, at least that many times.","helpTextSettingsView":""},{"labelUserKey":"h5IndexScopus","labelUserView":"h5-index (Scopus)","labelSettingsView":"h5-index (Scopus)","helpTextUserView":"h-index is the number of an author''s articles in Scopus published in the last five years that have been cited, as defined by Scopus, at least that many times.","helpTextSettingsView":""}]', 'View Profile'); -UNLOCK TABLES; diff --git a/setup/populateAnalysisSummaryTables_v2.sql b/setup/populateAnalysisSummaryTables_v2.sql index de2bfbd..e2a6e3f 100644 --- a/setup/populateAnalysisSummaryTables_v2.sql +++ b/setup/populateAnalysisSummaryTables_v2.sql @@ -420,17 +420,19 @@ proc_main: BEGIN INSERT INTO analysis_summary_article_new ( pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, - datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, + datePublicationAddedToEntrez, datePublicationAddedToPMC, + articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus ) SELECT DISTINCT pmid, MAX(pmcid), publicationTypeCanonical, - articleYear, + IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4)), MIN(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, + MAX(datePublicationAddedToPMC), articleTitle, journalTitleVerbose, issn, @@ -564,53 +566,106 @@ proc_main: BEGIN CALL log_progress(v_job_id, v_step, 'Complete', 'DONE', NULL, CONCAT(TIMESTAMPDIFF(SECOND, v_start_time, NOW()), 's elapsed')); + -- ======================================================================== -- STEP 4: Populate analysis_summary_person_new -- ======================================================================== SET v_step = '4. 
Populate analysis_summary_person'; CALL log_progress(v_job_id, v_step, 'Inserting person records', 'RUNNING', NULL, NULL); - INSERT INTO analysis_summary_person_new (personIdentifier, nameFirst, nameMiddle, nameLast, facultyRank, department) - SELECT DISTINCT - p.personIdentifier, - p.firstName, - p.middleName, - p.lastName, - p.title, - p.primaryOrganizationalUnit - FROM person p - JOIN analysis_summary_person_scope s ON s.personIdentifier = p.personIdentifier; + -- Populate using person_person_type to derive facultyRank + INSERT INTO analysis_summary_person_new (personIdentifier, nameFirst, nameMiddle, nameLast, department, facultyRank) + SELECT * FROM ( + SELECT DISTINCT + p.personIdentifier, + p.firstName AS nameFirst, + p.middleName AS nameMiddle, + p.lastName AS nameLast, + p.primaryOrganizationalUnit AS department, + COALESCE(a.facultyRank, b.facultyRank, c.facultyRank, d.facultyRank) AS facultyRank + FROM person p + + LEFT JOIN ( + SELECT personIdentifier, 'Full Professor' AS facultyRank + FROM person_person_type + WHERE personType = 'academic-faculty-fullprofessor' + ) a ON a.personIdentifier = p.personIdentifier + + LEFT JOIN ( + SELECT personIdentifier, 'Associate Professor' AS facultyRank + FROM person_person_type + WHERE personType = 'academic-faculty-associate' + ) b ON b.personIdentifier = p.personIdentifier + + LEFT JOIN ( + SELECT personIdentifier, 'Assistant Professor' AS facultyRank + FROM person_person_type + WHERE personType = 'academic-faculty-assistant' + ) c ON c.personIdentifier = p.personIdentifier + + LEFT JOIN ( + SELECT personIdentifier, 'Instructor or Lecturer' AS facultyRank + FROM person_person_type + WHERE personType IN ('academic-faculty-instructor', 'academic-faculty-lecturer') + ) d ON d.personIdentifier = p.personIdentifier + + INNER JOIN analysis_summary_person_scope e ON e.personIdentifier = p.personIdentifier + ) x + WHERE facultyRank IS NOT NULL; SET v_rows = ROW_COUNT(); CALL log_progress(v_job_id, v_step, 'Inserted person 
records', 'INFO', v_rows, NULL); - -- Update article counts + -- ======================================================================== + -- STEP 4b: Compute article counts + -- Counts are for articles with publicationTypeNIH = 'Research Article' + -- and percentileNIH is not null + -- ======================================================================== CALL log_progress(v_job_id, v_step, 'Updating article counts', 'RUNNING', NULL, NULL); + + -- countAll: Count of research articles with NIH percentile UPDATE analysis_summary_person_new p JOIN ( - SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt - FROM analysis_summary_author_new - GROUP BY personIdentifier - ) c ON c.personIdentifier = p.personIdentifier - SET p.countAll = c.cnt; + SELECT s.personIdentifier, COUNT(a1.pmid) AS count + FROM analysis_summary_person_new s + JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL + GROUP BY s.personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.countAll = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated countAll', 'INFO', v_rows, NULL); + -- countFirst: Count of first-authored research articles with NIH percentile UPDATE analysis_summary_person_new p JOIN ( - SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt - FROM analysis_summary_author_new - WHERE authorPosition = 'first' - GROUP BY personIdentifier - ) c ON c.personIdentifier = p.personIdentifier - SET p.countFirst = c.cnt; + SELECT s.personIdentifier, COUNT(a1.pmid) AS count + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL + AND a.authorPosition = 'first' + GROUP BY s.personIdentifier + ) x 
ON x.personIdentifier = p.personIdentifier + SET p.countFirst = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated countFirst', 'INFO', v_rows, NULL); + -- countSenior: Count of senior/last-authored research articles with NIH percentile UPDATE analysis_summary_person_new p JOIN ( - SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt - FROM analysis_summary_author_new - WHERE authorPosition = 'last' - GROUP BY personIdentifier - ) c ON c.personIdentifier = p.personIdentifier - SET p.countSenior = c.cnt; + SELECT s.personIdentifier, COUNT(a1.pmid) AS count + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL + AND a.authorPosition = 'last' + GROUP BY s.personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.countSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated countSenior', 'INFO', v_rows, NULL); IF v_error_occurred THEN CALL log_progress(v_job_id, v_step, 'Failed', 'ERROR', NULL, v_error_message); @@ -621,152 +676,419 @@ proc_main: BEGIN CALL log_progress(v_job_id, v_step, 'Complete', 'DONE', NULL, CONCAT(TIMESTAMPDIFF(SECOND, v_start_time, NOW()), 's elapsed')); -- ======================================================================== - -- STEP 5: Compute percentile rankings (with rank and denominator) + -- STEP 5: Compute percentile rankings (peer-based) + -- Percentile = average of top N articles by percentileNIH + -- Denominator = count of people with same facultyRank who have the metric + -- Rank = rank within facultyRank by percentile value -- ======================================================================== SET v_step = '5. 
Compute percentile rankings'; - CALL log_progress(v_job_id, v_step, 'Computing percentiles (8 metrics with rank/denominator)', 'RUNNING', NULL, NULL); + CALL log_progress(v_job_id, v_step, 'Computing percentiles (peer-based avg of top N)', 'RUNNING', NULL, NULL); + + -- ======================================================================== + -- 5a. TOP 5 PERCENTILE - ALL POSITIONS + -- ======================================================================== - -- Top 5 percentile, first/last authored + -- top5PercentileAll: Average of top 5 percentiles (requires countAll > 4) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition IN ('first', 'last') - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countAll > 4 + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileFirstSenior = x.pct, - p.top5RankFirstSenior = x.rank_count, - p.top5DenominatorFirstSenior = x.denominator; + SET p.top5PercentileAll = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileAll', 'INFO', v_rows, NULL); - -- Top 10 percentile, first/last authored + -- top5DenominatorAll: Count of people in 
same facultyRank with valid top5PercentileAll UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition IN ('first', 'last') - GROUP BY a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileAll IS NOT NULL AND countAll > 4 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorAll = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorAll', 'INFO', v_rows, NULL); + + -- top5RankAll: Rank within facultyRank by top5PercentileAll + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileAll DESC) AS personRank + FROM analysis_summary_person_new + WHERE countAll > 4 ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileFirstSenior = x.pct, - p.top10RankFirstSenior = x.rank_count, - p.top10DenominatorFirstSenior = x.denominator; + SET p.top5RankAll = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankAll', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5b. 
TOP 10 PERCENTILE - ALL POSITIONS + -- ======================================================================== - -- Top 5 percentile, first authored only + -- top10PercentileAll: Average of top 10 percentiles (requires countAll > 9) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'first' - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countAll > 9 + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileFirst = x.pct, - p.top5RankFirst = x.rank_count, - p.top5DenominatorFirst = x.denominator; + SET p.top10PercentileAll = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileAll', 'INFO', v_rows, NULL); - -- Top 10 percentile, first authored only + -- top10DenominatorAll: Count of people in same facultyRank with valid top10PercentileAll UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN 
analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'first' - GROUP BY a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileAll IS NOT NULL AND countAll > 9 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorAll = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorAll', 'INFO', v_rows, NULL); + + -- top10RankAll: Rank within facultyRank by top10PercentileAll + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileAll DESC) AS personRank + FROM analysis_summary_person_new + WHERE countAll > 9 ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileFirst = x.pct, - p.top10RankFirst = x.rank_count, - p.top10DenominatorFirst = x.denominator; + SET p.top10RankAll = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankAll', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5c. 
TOP 5 PERCENTILE - FIRST AUTHOR ONLY + -- ======================================================================== - -- Top 5 percentile, last authored only + -- top5PercentileFirst: Average of top 5 percentiles for first-authored (requires countFirst > 4) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'last' - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countFirst > 4 + AND a.authorPosition = 'first' + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileSenior = x.pct, - p.top5RankSenior = x.rank_count, - p.top5DenominatorSenior = x.denominator; + SET p.top5PercentileFirst = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileFirst', 'INFO', v_rows, NULL); - -- Top 10 percentile, last authored only + -- top5DenominatorFirst UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new 
a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'last' - GROUP BY a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileFirst IS NOT NULL AND countFirst > 4 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorFirst = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorFirst', 'INFO', v_rows, NULL); + + -- top5RankFirst + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileFirst DESC) AS personRank + FROM analysis_summary_person_new + WHERE countFirst > 4 ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileSenior = x.pct, - p.top10RankSenior = x.rank_count, - p.top10DenominatorSenior = x.denominator; + SET p.top5RankFirst = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankFirst', 'INFO', v_rows, NULL); - -- Top 5 percentile, all positions + -- ======================================================================== + -- 5d. 
TOP 10 PERCENTILE - FIRST AUTHOR ONLY + -- ======================================================================== + + -- top10PercentileFirst: Average of top 10 percentiles for first-authored (requires countFirst > 9) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countFirst > 9 + AND a.authorPosition = 'first' + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileAll = x.pct, - p.top5RankAll = x.rank_count, - p.top5DenominatorAll = x.denominator; + SET p.top10PercentileFirst = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileFirst', 'INFO', v_rows, NULL); - -- Top 10 percentile, all positions + -- top10DenominatorFirst UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - GROUP BY 
a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileFirst IS NOT NULL AND countFirst > 9 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorFirst = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorFirst', 'INFO', v_rows, NULL); + + -- top10RankFirst + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileFirst DESC) AS personRank + FROM analysis_summary_person_new + WHERE countFirst > 9 + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10RankFirst = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankFirst', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5e. TOP 5 PERCENTILE - SENIOR/LAST AUTHOR ONLY + -- ======================================================================== + + -- top5PercentileSenior: Average of top 5 percentiles for last-authored (requires countSenior > 4) + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countSenior > 4 + AND a.authorPosition = 'last' + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5PercentileSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileSenior', 'INFO', v_rows, NULL); + + -- top5DenominatorSenior + 
UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileSenior IS NOT NULL AND countSenior > 4 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorSenior', 'INFO', v_rows, NULL); + + -- top5RankSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE countSenior > 4 + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5RankSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankSenior', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5f. TOP 10 PERCENTILE - SENIOR/LAST AUTHOR ONLY + -- ======================================================================== + + -- top10PercentileSenior: Average of top 10 percentiles for last-authored (requires countSenior > 9) + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countSenior > 9 + AND a.authorPosition = 'last' + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10PercentileSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileSenior', 'INFO', v_rows, NULL); 
+ + -- top10DenominatorSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileSenior IS NOT NULL AND countSenior > 9 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorSenior', 'INFO', v_rows, NULL); + + -- top10RankSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE countSenior > 9 + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10RankSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankSenior', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5g. TOP 5 PERCENTILE - FIRST OR SENIOR (combined) + -- Note: countFirstSenior is computed inline since column doesn't exist + -- ======================================================================== + + -- top5PercentileFirstSenior: Average of top 5 percentiles for first/last authored + -- Requires at least 5 first+last authored articles with percentileNIH + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL + AND a.authorPosition IN ('first', 'last') + AND s.personIdentifier IN ( + -- Only include people with > 4 first/last articles + SELECT s2.personIdentifier + FROM 
analysis_summary_person_new s2 + Join analysis_summary_author_new a2 ON a2.personIdentifier = s2.personIdentifier + Join analysis_summary_article_new a12 ON a12.pmid = a2.pmid + WHERE a12.publicationTypeNIH = 'Research Article' AND a12.percentileNIH IS NOT NULL + AND a2.authorPosition IN ('first', 'last') + GROUP BY s2.personIdentifier + HAVING COUNT(*) > 4 + ) + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5PercentileFirstSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileFirstSenior', 'INFO', v_rows, NULL); + + -- top5DenominatorFirstSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileFirstSenior IS NOT NULL + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorFirstSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorFirstSenior', 'INFO', v_rows, NULL); + + -- top5RankFirstSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileFirstSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE top5PercentileFirstSenior IS NOT NULL + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5RankFirstSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankFirstSenior', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5h. 
TOP 10 PERCENTILE - FIRST OR SENIOR (combined) + -- ======================================================================== + + -- top10PercentileFirstSenior (requires > 9 first/last articles) + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL + AND a.authorPosition IN ('first', 'last') + AND s.personIdentifier IN ( + -- Only include people with > 9 first/last articles + SELECT s2.personIdentifier + FROM analysis_summary_person_new s2 + Join analysis_summary_author_new a2 ON a2.personIdentifier = s2.personIdentifier + Join analysis_summary_article_new a12 ON a12.pmid = a2.pmid + WHERE a12.publicationTypeNIH = 'Research Article' AND a12.percentileNIH IS NOT NULL + AND a2.authorPosition IN ('first', 'last') + GROUP BY s2.personIdentifier + HAVING COUNT(*) > 9 + ) + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10PercentileFirstSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileFirstSenior', 'INFO', v_rows, NULL); + + -- top10DenominatorFirstSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileFirstSenior IS NOT NULL + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorFirstSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorFirstSenior', 'INFO', v_rows, NULL); + + -- top10RankFirstSenior + UPDATE 
analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileFirstSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE top10PercentileFirstSenior IS NOT NULL ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileAll = x.pct, - p.top10RankAll = x.rank_count, - p.top10DenominatorAll = x.denominator; + SET p.top10RankFirstSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankFirstSenior', 'INFO', v_rows, NULL); IF v_error_occurred THEN CALL log_progress(v_job_id, v_step, 'Failed', 'ERROR', NULL, v_error_message); diff --git a/setup/table_admin_permissions.sql b/setup/table_admin_permissions.sql new file mode 100644 index 0000000..c1d37af --- /dev/null +++ b/setup/table_admin_permissions.sql @@ -0,0 +1,56 @@ +-- Seed data for the data-driven RBAC permission tables. +-- Mirror of the SEED section of ReCiter-Publication-Manager +-- scripts/migrations/add-permission-tables.sql (3-places rule). +-- +-- Run ONCE per environment, after createDatabaseTableReciterDb.sql has created +-- the tables and table_admin_roles.sql has seeded admin_roles. The role->permission +-- seed joins on admin_roles.roleLabel, so it adapts to whatever roles an +-- environment defines; 'Curator_Scoped' is a harmless no-op where that role +-- does not exist. + +-- 1. 
Permissions (7) +INSERT INTO `admin_permissions` (`permissionKey`, `label`, `description`, `category`) VALUES + ('canCurate', 'Curate Publications', 'Accept or reject article suggestions for people', 'Curation'), + ('canSearch', 'Search Identities', 'Search and browse the identity directory', 'Navigation'), + ('canReport', 'Create Reports', 'Generate publication reports and export data', 'Reporting'), + ('canManageUsers', 'Manage Users', 'Create, edit, and deactivate user accounts and assign roles', 'Administration'), + ('canConfigure', 'Configuration', 'Edit application settings, labels, and field visibility', 'Administration'), + ('canManageNotifications', 'Manage Notifications', 'Configure notification preferences', 'Communication'), + ('canManageProfile', 'Manage Profile', 'View and edit user profile information', 'Profile'); + +-- 2. Role -> permission mappings (reproduces current behavior) +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Superuser'; -- all 7 +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_All' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_Self' AND ap.permissionKey IN ('canCurate'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_Scoped' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 
'Curator_Department' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_Department_Delegate' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Reporter_All' AND ap.permissionKey IN ('canReport','canSearch'); + +-- 3. Nav resources (sidebar items) +INSERT INTO `admin_permission_resources` (`permissionID`, `resourceType`, `resourceKey`, `displayOrder`, `icon`, `label`, `route`) + SELECT ap.permissionID, v.resourceType, v.resourceKey, v.displayOrder, v.icon, v.label, v.route + FROM admin_permissions ap + JOIN ( + SELECT 'canSearch' AS pk, 'nav' AS resourceType, 'nav_search' AS resourceKey, 1 AS displayOrder, 'Search' AS icon, 'Find People' AS label, '/search' AS route + UNION ALL SELECT 'canCurate', 'nav', 'nav_curate', 2, 'LocalLibrary', 'Curate Publications', '/curate' + UNION ALL SELECT 'canReport', 'nav', 'nav_report', 3, 'Assessment', 'Create Reports', '/report' + UNION ALL SELECT 'canManageNotifications', 'nav', 'nav_notifications', 4, 'NotificationsActive', 'Manage Notifications', '/notifications' + UNION ALL SELECT 'canManageProfile', 'nav', 'nav_profile', 5, 'AccountCircle', 'Manage Profile', '/manageprofile' + UNION ALL SELECT 'canManageUsers', 'nav', 'nav_users', 6, 'Group', 'Manage Users', '/manageusers' + UNION ALL SELECT 'canConfigure', 'nav', 'nav_config', 7, 'Settings', 'Configuration', '/configuration' + ) v ON ap.permissionKey = v.pk; diff --git a/setup/table_admin_roles.sql b/setup/table_admin_roles.sql new file mode 100644 index 0000000..50dcf8b --- /dev/null +++ b/setup/table_admin_roles.sql @@ -0,0 +1,9 @@ +LOCK TABLES `admin_roles` WRITE; +INSERT INTO `admin_roles` (`roleID`, `roleLabel`) 
VALUES + (1,'Superuser'), + (2,'Curator_All'), + (3,'Reporter_All'), + (4,'Curator_Self'), + (5,'Curator_Department'), + (6,'Curator_Department_Delegate'); +UNLOCK TABLES; \ No newline at end of file diff --git a/setup/table_authorship_review.sql b/setup/table_authorship_review.sql new file mode 100644 index 0000000..366986c --- /dev/null +++ b/setup/table_authorship_review.sql @@ -0,0 +1,82 @@ +-- ----------------------------------------------------------------------------- +-- authorship_review — Publication Manager "Authorships" review queue +-- ----------------------------------------------------------------------------- +-- DURABLE TABLE — survives the nightly truncate-reload. Like `grant_provenance` +-- (the (person,pmid,grant) audit log) and the `admin_*` tables, this is curator +-- state, NOT a reporting export. It MUST NOT be added to any truncate list +-- (see update/updateReciterDB.py `all_tables`) and is not touched by any nightly +-- stored procedure or ETL step. CREATE TABLE IF NOT EXISTS so re-applying is safe. +-- +-- One row per WCM-affiliated AUTHORSHIP (a PubMed author carrying a WCM affiliation +-- on a publication) that is NOT yet assigned to any identity. Powers the Curator_All +-- `/authorships` tab in ReCiter-Publication-Manager (reads this table via Sequelize). +-- +-- POPULATED EXTERNALLY (this repo's ETL cannot compute the scores). The producer is +-- the adversarial-attribution-review pipeline in the ReCiter Research project +-- (scripts/aar_orchestrator.py -> aar_db.py upsert), which runs the gate (reciterdb +-- analysis_summary_author = accepted set), the identity matcher (reciterdb identity), +-- and the pinned XGBoost 3.2.0 models over the S3 scoring inputs to compute the +-- feedback-identity (FG) and identity-only (IO) scores per authorship. Monthly cron. 
+-- +-- Classification per authorship (the producer sets it): +-- absent top candidate never scored by production (no person_article row) +-- suggested top candidate production final (FG) >= 30 — already in a pending queue +-- buried top candidate FG < 30 (IO can be high) — production buried it +-- assigned reserved (accepted rows are excluded by the gate, not stored here) +-- +-- single_candidate = exactly one WCM identity matches the author's surname + +-- given/initial (cohort_size == 1) — the strongest precision signal; such rows are +-- near-certain and form the high-precision review lane. +-- +-- Refresh contract: the producer UPSERTs by author_key, refreshing the scoring/ +-- classification columns and `last_refreshed`; it NEVER overwrites a curator-set +-- `status` (assigned/accepted/rejected/dismissed/snoozed) or its resolution_cwid/ +-- reviewer/note/snooze_until, and `first_seen` is set once and never overwritten. +-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `authorship_review` ( + `id` BIGINT NOT NULL AUTO_INCREMENT, + `pmid` BIGINT NOT NULL, + `author_key` VARCHAR(32) NOT NULL, -- `pmid:position` + `author_position` INT NULL, + `author_position_label` VARCHAR(8) NULL, -- first/middle/last + `wcm_author` VARCHAR(255) NULL, -- PubMed author name + `author_affiliation` TEXT NULL, + `entrez_date` DATE NULL, -- ReCiter entrez add date + `title` TEXT NULL, + `journal` VARCHAR(512) NULL, + `doi` VARCHAR(255) NULL, + `classification` ENUM('assigned','suggested','buried','absent') NULL, + `top_cwid` VARCHAR(32) NULL, -- proposed identity + `top_name` VARCHAR(255) NULL, + `top_person_type` VARCHAR(64) NULL, + `top_dept` VARCHAR(255) NULL, + `top_fg_score` FLOAT NULL, -- production final (FG) + `top_io_score` FLOAT NULL, -- identity-only (IO) + `top_confidence` FLOAT NULL, + `top_cohort_size` INT NULL, -- homonyms (surname+initial) + `top_given_match` VARCHAR(16) NULL, -- full|initial + 
`top_affil_match` TINYINT(1) NULL, + `n_candidates` INT NULL, + `single_candidate` TINYINT(1) NULL, -- cohort_size == 1 + `candidate_cwids_json` LONGTEXT NULL, -- ranked alternates + `status` ENUM('open','assigned','accepted','rejected','dismissed','snoozed') + NOT NULL DEFAULT 'open', -- curator state + `resolution_cwid` VARCHAR(32) NULL, + `reviewer` VARCHAR(64) NULL, + `note` TEXT NULL, + `snooze_until` DATE NULL, + `resolved_at` DATETIME NULL, + `first_seen` DATETIME NULL, -- set once, never overwritten + `last_refreshed` DATETIME NULL, + `last_checked` DATETIME NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `uq_author_key` (`author_key`), + KEY `ix_pmid` (`pmid`), + KEY `ix_classification` (`classification`), + KEY `ix_status` (`status`), + KEY `ix_single_candidate` (`single_candidate`), + KEY `ix_top_io_score` (`top_io_score`), + KEY `ix_entrez_date` (`entrez_date`), + KEY `ix_top_cwid` (`top_cwid`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; diff --git a/update/abstractImport.py b/update/abstractImport.py index 867b538..8057b24 100644 --- a/update/abstractImport.py +++ b/update/abstractImport.py @@ -1,10 +1,10 @@ # abstractImport.py import boto3 -import csv import logging import pymysql.cursors import pymysql.err +import random import sys import time import os @@ -20,6 +20,10 @@ ) logger = logging.getLogger(__name__) +# Quiet botocore's per-call credential/endpoint chatter so pipeline logs stay readable. 
+logging.getLogger("botocore").setLevel(logging.WARNING) +logging.getLogger("boto3").setLevel(logging.WARNING) + # ------------------------------------------------------------------------------ # Environment Variables # ------------------------------------------------------------------------------ @@ -28,15 +32,31 @@ DB_HOST = os.getenv("DB_HOST") DB_NAME = os.getenv("DB_NAME") -# DynamoDB concurrency settings -CHUNK_SIZE = 100 # Max items per batch_get_item call -MAX_WORKERS = 5 # Number of threads for parallel fetching +# ------------------------------------------------------------------------------ +# Settings +# ------------------------------------------------------------------------------ +# DynamoDB fetch +CHUNK_SIZE = 100 # Max keys per batch_get_item call (DynamoDB hard limit) +MAX_WORKERS = 5 # Threads for parallel fetching +MAX_UNPROCESSED_RETRIES = 8 # Backoff retries for keys DynamoDB reports as unprocessed + +# Insert +INSERT_BATCH_SIZE = 200 # Rows per executemany batch (kept well under max_allowed_packet) + +# Loop safety +MAX_CYCLES = 25 # Hard cap on fetch/insert cycles; a healthy run needs 1-2 + +# Dry run +DRY_RUN = "--dry-run" in sys.argv +DRY_RUN_SAMPLE = 500 # PMIDs processed when --dry-run is passed +DRY_RUN_TABLE = "reporting_abstracts_dryrun" + # ------------------------------------------------------------------------------ # Database Connection # ------------------------------------------------------------------------------ def connect_mysql_server(db_user, db_pass, db_host, db_name): - """Function to connect to MySQL database""" + """Connect to the MariaDB database.""" try: mysql_db = pymysql.connect( user=db_user, @@ -44,7 +64,7 @@ def connect_mysql_server(db_user, db_pass, db_host, db_name): database=db_name, host=db_host, autocommit=True, - local_infile=True, + charset="utf8mb4", cursorclass=pymysql.cursors.DictCursor ) logger.info(f"Connected to database server: {db_host}, database: {db_name}, user: {db_user}") @@ -53,13 +73,14 @@ def 
connect_mysql_server(db_user, db_pass, db_host, db_name): logger.error(f"{time.ctime()} -- Error connecting to the database: {err}") sys.exit(1) + # ------------------------------------------------------------------------------ # Fetch All Missing PMIDs # ------------------------------------------------------------------------------ def fetch_missing_pmids(mysql_conn): """ - Returns a list of all PMIDs that exist in analysis_summary_article - but do NOT exist in reporting_abstracts. + Returns every PMID that exists in analysis_summary_article but has no + matching row in reporting_abstracts. """ sql = """ SELECT DISTINCT p.pmid AS pmid @@ -69,16 +90,17 @@ def fetch_missing_pmids(mysql_conn): """ with mysql_conn.cursor() as cursor: cursor.execute(sql) - rows = cursor.fetchall() - return [row["pmid"] for row in rows] + return [row["pmid"] for row in cursor.fetchall()] + # ------------------------------------------------------------------------------ # Extract Abstract Text # ------------------------------------------------------------------------------ def get_abstract(item): """ - Extracts the abstract text from a DynamoDB item representing a PubMed article. - Handles labeled abstract segments if present. + Extracts the abstract text from a DynamoDB item representing a PubMed + article. Handles labeled abstract segments. Returns "" when no abstract + is present. """ medline_citation = item.get("pubmedarticle", {}).get("medlinecitation") if not medline_citation: @@ -102,125 +124,237 @@ def get_abstract(item): return " ".join(abstract_texts) if abstract_texts else "" + # ------------------------------------------------------------------------------ -# Batch Fetch Abstracts from DynamoDB +# Fetch Abstracts from DynamoDB # ------------------------------------------------------------------------------ def fetch_abstracts_for_chunk(chunk_pmids): """ - Performs a single batch_get_item call for the given chunk of PMIDs. - Returns a list of (pmid, abstract_text) pairs. 
+ Fetches one chunk of PMIDs from DynamoDB via batch_get_item. Any keys that + DynamoDB reports as unprocessed (throttling) are retried with exponential + backoff so they are not silently lost. Returns (pmid, abstract) pairs. """ - dynamodb = boto3.resource("dynamodb") - client = dynamodb.meta.client + client = boto3.resource("dynamodb").meta.client + + request_keys = [{"pmid": pmid} for pmid in chunk_pmids] + results = [] + attempt = 0 - # Prepare Keys for batch_get_item - keys = [{"pmid": pmid} for pmid in chunk_pmids] + while request_keys: + response = client.batch_get_item( + RequestItems={"PubMedArticle": {"Keys": request_keys}} + ) - # Perform batch_get_item - response = client.batch_get_item( - RequestItems={ - "PubMedArticle": {"Keys": keys} - } - ) + for item in response["Responses"].get("PubMedArticle", []): + pmid = item.get("pmid") + if pmid is not None: + results.append((pmid, get_abstract(item))) - items = response["Responses"].get("PubMedArticle", []) - results = [] - for item in items: - pmid = item.get("pmid") - if pmid: - abstract_text = get_abstract(item) - results.append((pmid, abstract_text)) + request_keys = ( + response.get("UnprocessedKeys", {}) + .get("PubMedArticle", {}) + .get("Keys", []) + ) + if request_keys: + attempt += 1 + if attempt > MAX_UNPROCESSED_RETRIES: + logger.warning( + f"{len(request_keys)} key(s) still unprocessed after " + f"{MAX_UNPROCESSED_RETRIES} retries; skipping this chunk's remainder." + ) + break + time.sleep(min(0.1 * (2 ** attempt), 5.0)) return results + +def fetch_all_abstracts(pmids): + """Fetches abstracts for all given PMIDs from DynamoDB in parallel.""" + chunks = [pmids[i:i + CHUNK_SIZE] for i in range(0, len(pmids), CHUNK_SIZE)] + logger.info(f"Created {len(chunks)} chunk(s). 
Each chunk up to {CHUNK_SIZE} PMIDs.") + + all_results = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + futures = {executor.submit(fetch_abstracts_for_chunk, c): c for c in chunks} + for future in concurrent.futures.as_completed(futures): + try: + all_results.extend(future.result()) + except Exception as e: + logger.exception(f"Error fetching chunk: {e}") + return all_results + + # ------------------------------------------------------------------------------ -# Bulk-Load a Single CSV into reporting_abstracts +# Insert Abstracts # ------------------------------------------------------------------------------ -def load_csv_into_reporting_abstracts(mysql_conn, csv_path): +def insert_abstracts(mysql_conn, results, target_table="reporting_abstracts"): + """ + Inserts (pmid, abstract) pairs with a parameterized, batched INSERT. + + pymysql binds every value as a query parameter, so abstracts containing + double quotes, tabs, newlines or backslashes are stored verbatim. The + previous CSV + LOAD DATA INFILE path could not parse such content and + silently dropped the affected rows. 
+ """ + if not results: + logger.info("No abstracts to insert.") + return 0 + + insert_sql = f"INSERT INTO {target_table} (pmid, abstract) VALUES (%s, %s)" + inserted = 0 with mysql_conn.cursor() as cursor: - cwd = os.getcwd() - full_csv_path = os.path.join(cwd, csv_path).replace("\\", "/") # Ensure correct path format - - load_query = ( - "LOAD DATA LOCAL INFILE '{path}' " - "INTO TABLE reporting_abstracts " - "FIELDS TERMINATED BY '\t' ENCLOSED BY '\"' " - "LINES TERMINATED BY '\n' " - "IGNORE 1 LINES (pmid, abstract);" - ).format(path=full_csv_path) - - cursor.execute(load_query) - logger.info(f"{time.ctime()} -- {csv_path} loaded into reporting_abstracts.") - - update_query = ( - "UPDATE reporting_abstracts " - "SET abstractVarchar = CAST(abstract AS CHAR(15000)) " - "WHERE abstractVarchar IS NULL;" + for i in range(0, len(results), INSERT_BATCH_SIZE): + batch = results[i:i + INSERT_BATCH_SIZE] + cursor.executemany(insert_sql, batch) + inserted += len(batch) + logger.info(f"{time.ctime()} -- Inserted {inserted} row(s) into {target_table}.") + + cursor.execute( + f"UPDATE {target_table} " + f"SET abstractVarchar = CAST(abstract AS CHAR(15000)) " + f"WHERE abstractVarchar IS NULL" ) - cursor.execute(update_query) - logger.info(f"{time.ctime()} -- reporting_abstracts updated with varchar equivalents.") + logger.info(f"{time.ctime()} -- {target_table} updated with varchar equivalents.") + return inserted + # ------------------------------------------------------------------------------ -# Main Script Logic +# Dry Run +# ------------------------------------------------------------------------------ +def run_dry_run(mysql_conn): + """ + Verifies the fetch -> insert path end to end without modifying + reporting_abstracts: a random sample of missing PMIDs is processed into a + session-private TEMPORARY table, then verified and discarded. 
+ """ + logger.info("=== DRY RUN === reporting_abstracts will NOT be modified.") + + all_pmids = fetch_missing_pmids(mysql_conn) + logger.info(f"{len(all_pmids)} PMID(s) currently missing abstracts in production.") + if not all_pmids: + logger.info("Nothing missing; no sample to process.") + mysql_conn.close() + return + + sample = random.sample(all_pmids, min(DRY_RUN_SAMPLE, len(all_pmids))) + logger.info(f"Processing a random sample of {len(sample)} PMID(s) through the new insert path.") + + try: + with mysql_conn.cursor() as cursor: + cursor.execute(f"CREATE TEMPORARY TABLE {DRY_RUN_TABLE} LIKE reporting_abstracts") + + all_results = fetch_all_abstracts(sample) + logger.info(f"Fetched {len(all_results)} item(s) from DynamoDB (requested {len(sample)}).") + if not all_results: + logger.error("DRY RUN FAILED: DynamoDB returned nothing for the sample.") + return + + poison = [ + (p, a) for p, a in all_results + if a and any(c in a for c in ('"', '\t', '\n', '\r', '\\')) + ] + logger.info( + f"{len(poison)} of {len(all_results)} fetched abstracts contain " + f"quotes/tabs/newlines/backslashes -- the content the old LOAD DATA " + f"path silently dropped." + ) + if poison: + logger.info(f"Example poison abstract (PMID {poison[0][0]}): {poison[0][1][:160]!r}") + + inserted = insert_abstracts(mysql_conn, all_results, target_table=DRY_RUN_TABLE) + + with mysql_conn.cursor() as cursor: + cursor.execute( + f"SELECT COUNT(*) c, COUNT(DISTINCT pmid) d, " + f"SUM(pmid IS NULL) nullp, SUM(abstractVarchar IS NULL) nullv " + f"FROM {DRY_RUN_TABLE}" + ) + stats = cursor.fetchone() + + counts_ok = ( + stats["c"] == len(all_results) + and not stats["nullp"] + and not stats["nullv"] + ) + + # Content-integrity check: re-read the longest poison abstract verbatim. 
+ integrity_ok = True + if poison: + worst_pmid, worst_abs = max(poison, key=lambda x: len(x[1])) + with mysql_conn.cursor() as cursor: + cursor.execute( + f"SELECT abstract FROM {DRY_RUN_TABLE} WHERE pmid = %s", (worst_pmid,) + ) + stored = cursor.fetchone()["abstract"] + if isinstance(stored, bytes): + stored = stored.decode("utf-8") + integrity_ok = (stored == worst_abs) + logger.info( + f"Content-integrity check on PMID {worst_pmid} " + f"({len(worst_abs)} chars, contains poison characters): " + f"{'MATCH' if integrity_ok else 'MISMATCH'}" + ) + + if counts_ok and integrity_ok: + logger.info( + f"DRY RUN PASSED -- {inserted} row(s) inserted; {stats['c']} present; " + f"{stats['d']} distinct PMIDs; 0 NULL pmids; 0 NULL abstractVarchar; " + f"content stored verbatim." + ) + else: + logger.error( + f"DRY RUN FAILED -- rows={stats['c']} (expected {len(all_results)}); " + f"null_pmid={stats['nullp']}; null_varchar={stats['nullv']}; " + f"integrity_ok={integrity_ok}" + ) + finally: + with mysql_conn.cursor() as cursor: + cursor.execute(f"DROP TEMPORARY TABLE IF EXISTS {DRY_RUN_TABLE}") + logger.info(f"Scratch table {DRY_RUN_TABLE} dropped.") + mysql_conn.close() + + +# ------------------------------------------------------------------------------ +# Main # ------------------------------------------------------------------------------ def main(): - # 1) Connect to MySQL mysql_conn = connect_mysql_server(DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME) - while True: - # 2) Fetch all missing PMIDs + if DRY_RUN: + run_dry_run(mysql_conn) + return + + prev_missing = None + for cycle in range(1, MAX_CYCLES + 1): all_pmids = fetch_missing_pmids(mysql_conn) if not all_pmids: logger.info("No more missing abstracts. 
We are done.") break - logger.info(f"Found {len(all_pmids)} PMIDs needing abstracts.") - - # 3) Remove any existing abstract.csv - csv_path = "abstract.csv" - if os.path.exists(csv_path): - os.remove(csv_path) + logger.info(f"Cycle {cycle}: found {len(all_pmids)} PMID(s) needing abstracts.") - # 4) Chunk the PMIDs - chunks = [ - all_pmids[i : i + CHUNK_SIZE] - for i in range(0, len(all_pmids), CHUNK_SIZE) - ] - logger.info(f"Created {len(chunks)} chunk(s). Each chunk up to {CHUNK_SIZE} PMIDs.") - - # Accumulate all results in memory for this iteration - all_results = [] - - # 5) Parallel fetch from DynamoDB - with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: - future_to_chunk = { - executor.submit(fetch_abstracts_for_chunk, c): c for c in chunks - } - for future in concurrent.futures.as_completed(future_to_chunk): - try: - chunk_result = future.result() - all_results.extend(chunk_result) - except Exception as e: - logger.exception(f"Error fetching chunk: {e}") - - logger.info(f"Fetched abstracts for {len(all_results)} PMIDs in this cycle.") - - # 6) Write to CSV - with open(csv_path, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f, delimiter="\t") - writer.writerow(["pmid", "abstract"]) - for pmid, abstract_text in all_results: - writer.writerow([pmid, abstract_text]) - - # 7) Load CSV into DB - load_csv_into_reporting_abstracts(mysql_conn, csv_path) - - # We then loop again in case there are additional PMIDs that - # appeared or newly became missing. Usually, you won't see more, - # but if your data is updated behind the scenes, it handles that too. + # Safety net: if a cycle does not reduce the missing count, the + # remaining PMIDs cannot be resolved (no DynamoDB record). Stop rather + # than loop forever -- the failure mode that hung the nightly pipeline. 
+ if prev_missing is not None and len(all_pmids) >= prev_missing: + logger.warning( + f"No progress since the previous cycle ({len(all_pmids)} PMID(s) " + f"still missing); stopping. These PMIDs have no retrievable abstract." + ) + break + prev_missing = len(all_pmids) + + all_results = fetch_all_abstracts(all_pmids) + logger.info(f"Fetched abstracts for {len(all_results)} PMID(s) from DynamoDB.") + insert_abstracts(mysql_conn, all_results) + else: + logger.warning( + f"Reached the {MAX_CYCLES}-cycle safety limit with abstracts still " + f"missing; stopping. A healthy run converges in 1-2 cycles -- investigate." + ) mysql_conn.close() - logger.info("All missing abstracts have now been imported.") + logger.info("Abstract import complete.") if __name__ == "__main__": diff --git a/update/auditAbstracts.py b/update/auditAbstracts.py new file mode 100644 index 0000000..b56dbcb --- /dev/null +++ b/update/auditAbstracts.py @@ -0,0 +1,318 @@ +""" +auditAbstracts.py -- one-shot forensic audit of reporting_abstracts. + +Pulls rows where LENGTH(abstract) >= AUDIT_LENGTH_THRESHOLD, fetches the +DynamoDB ground truth for each PMID via the same path abstractImport.py +uses, and classifies each row: + + CLEAN DB matches Dynamo (long but legitimate abstract). + PREFIX_CORRUPTED First ~150 chars of the Dynamo abstract appear near + the start of the DB blob and DB is substantially + longer than Dynamo -- the cross-paper concatenation + pattern produced by the old CSV / LOAD DATA path. + DISJOINT DB front does not match Dynamo front; needs manual + review. + MISSING_IN_DYNAMO DynamoDB has no PubMedArticle record for the PMID. + EMPTY_IN_DYNAMO Record present but yields empty abstract. + +Outputs: + - audit_abstracts.csv one row per PMID examined + - audit_abstracts_dump.txt full text dump of the top N corrupted rows + - per-verdict counters and worst-offender summary to stdout + +Read-only. Does not modify reporting_abstracts. 
+ +Env: + DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME + AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION + AUDIT_LENGTH_THRESHOLD (default 4000) + AUDIT_MAX_CANDIDATES (default 1000) +""" + +import concurrent.futures +import csv +import logging +import os +import sys +import time + +import boto3 +import pymysql.cursors +import pymysql.err + + +DB_USERNAME = os.getenv("DB_USERNAME") +DB_PASSWORD = os.getenv("DB_PASSWORD") +DB_HOST = os.getenv("DB_HOST") +DB_NAME = os.getenv("DB_NAME") + +LENGTH_THRESHOLD = int(os.getenv("AUDIT_LENGTH_THRESHOLD", "4000")) +MAX_CANDIDATES = int(os.getenv("AUDIT_MAX_CANDIDATES", "1000")) + +CHUNK_SIZE = 100 +MAX_WORKERS = 5 +MAX_UNPROCESSED_RETRIES = 8 + +OUTPUT_CSV = "audit_abstracts.csv" +DUMP_FILE = "audit_abstracts_dump.txt" +DUMP_TOP_N = 5 + +# Compare on the first HEAD_SAMPLE chars of the Dynamo abstract; require +# it to be found within the first HEAD_SEARCH_WINDOW chars of the DB blob. +# Short enough to tolerate leading-character noise (the orphan `"` and +# similar CSV artifacts), long enough to be specific. +HEAD_SAMPLE = 150 +HEAD_SEARCH_WINDOW = 400 +# A DB blob this much longer than Dynamo is the concatenation signal. +LENGTH_INFLATION_RATIO = 1.3 +# BLOB-cap rule: a row right at the column cap with a Dynamo abstract many +# times smaller is the parser-desync fingerprint regardless of whether the +# first 150 chars happen to match (PubMed sometimes updated section labels +# between the original CSV load and now, which can defeat the head-string +# match). 
+BLOB_CAP_THRESHOLD = 60000 +BLOB_CAP_INFLATION_RATIO = 5 + +logging.basicConfig( + stream=sys.stdout, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) +logging.getLogger("botocore").setLevel(logging.WARNING) +logging.getLogger("boto3").setLevel(logging.WARNING) + + +def connect_mysql(): + try: + return pymysql.connect( + user=DB_USERNAME, + password=DB_PASSWORD, + database=DB_NAME, + host=DB_HOST, + autocommit=True, + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + ) + except pymysql.err.MySQLError as err: + logger.error(f"DB connection failed: {err}") + sys.exit(1) + + +def fetch_candidates(conn, threshold, max_rows): + sql = """ + SELECT pmid, LENGTH(abstract) AS db_len, abstract + FROM reporting_abstracts + WHERE LENGTH(abstract) >= %s + ORDER BY LENGTH(abstract) DESC + LIMIT %s + """ + with conn.cursor() as cur: + cur.execute(sql, (threshold, max_rows)) + rows = cur.fetchall() + for r in rows: + if isinstance(r["abstract"], (bytes, bytearray)): + r["abstract"] = r["abstract"].decode("utf-8", errors="replace") + r["abstract"] = r["abstract"].replace("\r\n", "\n") + return rows + + +def get_abstract(item): + """Same extraction logic as update/abstractImport.py:99.""" + medline_citation = item.get("pubmedarticle", {}).get("medlinecitation") + if not medline_citation: + return "" + article = medline_citation.get("article") + if not article: + return "" + publication_abstract = article.get("publicationAbstract") + if not publication_abstract: + return "" + abstract_texts = [] + for abstract_part in publication_abstract.get("abstractTexts", []): + label = abstract_part.get("abstractTextLabel") + text = abstract_part.get("abstractText") + if text: + label_text = f"{label}: " if label else "" + abstract_texts.append(label_text + text) + return " ".join(abstract_texts) if abstract_texts else "" + + +def fetch_abstracts_from_dynamo(pmids): + client = boto3.resource("dynamodb").meta.client 
+ + def fetch_chunk(chunk): + request_keys = [{"pmid": p} for p in chunk] + results = {} + present = set() + attempt = 0 + while request_keys: + response = client.batch_get_item( + RequestItems={"PubMedArticle": {"Keys": request_keys}} + ) + for item in response["Responses"].get("PubMedArticle", []): + pmid = item.get("pmid") + if pmid is not None: + present.add(pmid) + results[pmid] = get_abstract(item) + request_keys = ( + response.get("UnprocessedKeys", {}) + .get("PubMedArticle", {}) + .get("Keys", []) + ) + if request_keys: + attempt += 1 + if attempt > MAX_UNPROCESSED_RETRIES: + logger.warning( + f"{len(request_keys)} keys still unprocessed after " + f"{MAX_UNPROCESSED_RETRIES} retries; skipping remainder." + ) + break + time.sleep(min(0.1 * (2 ** attempt), 5.0)) + return results, present + + chunks = [pmids[i:i + CHUNK_SIZE] for i in range(0, len(pmids), CHUNK_SIZE)] + all_results = {} + found = set() + with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex: + futures = [ex.submit(fetch_chunk, c) for c in chunks] + for f in concurrent.futures.as_completed(futures): + res, present = f.result() + all_results.update(res) + found.update(present) + return all_results, found + + +def classify(db_abs, dyn_abs, dyn_present): + if not dyn_present: + return "MISSING_IN_DYNAMO" + if not dyn_abs: + return "EMPTY_IN_DYNAMO" + + db_norm = db_abs.strip() + dyn_norm = dyn_abs.strip() + if db_norm == dyn_norm: + return "CLEAN" + + db_len = len(db_norm) + dyn_len = len(dyn_norm) + + # Allow tiny tail differences (trailing whitespace/punctuation, an + # extra character or two) without flagging as corruption. 
+ if abs(db_len - dyn_len) <= 5 and db_norm[: min(db_len, dyn_len) - 5 if db_len > 5 else db_len].lstrip('"') == dyn_norm[: min(db_len, dyn_len) - 5 if db_len > 5 else dyn_len].lstrip('"'): + return "CLEAN" + + head_sample = dyn_norm[:HEAD_SAMPLE] + if head_sample and head_sample in db_norm[:HEAD_SEARCH_WINDOW]: + if db_len > dyn_len * LENGTH_INFLATION_RATIO: + return "PREFIX_CORRUPTED" + return "CLEAN" + + if db_len >= BLOB_CAP_THRESHOLD and db_len > dyn_len * BLOB_CAP_INFLATION_RATIO: + return "PREFIX_CORRUPTED" + + return "DISJOINT" + + +def safe_oneline(s, n): + return s[:n].replace("\n", " ").replace("\t", " ").replace("\r", " ") + + +def main(): + logger.info( + f"Audit: LENGTH(abstract) >= {LENGTH_THRESHOLD}; " + f"max candidates: {MAX_CANDIDATES}" + ) + conn = connect_mysql() + try: + candidates = fetch_candidates(conn, LENGTH_THRESHOLD, MAX_CANDIDATES) + finally: + conn.close() + + logger.info(f"Candidates from reporting_abstracts: {len(candidates)}") + if not candidates: + logger.info("Nothing above threshold; exiting.") + return + + lens = sorted(c["db_len"] for c in candidates) + logger.info( + f"DB length distribution: min={lens[0]} " + f"p50={lens[len(lens) // 2]} p95={lens[int(len(lens) * 0.95)]} " + f"max={lens[-1]}" + ) + + pmids = [c["pmid"] for c in candidates] + dyn_abstracts, dyn_present = fetch_abstracts_from_dynamo(pmids) + logger.info( + f"DynamoDB returned records for {len(dyn_present)} / {len(pmids)} PMIDs" + ) + + rows = [] + counters = { + "CLEAN": 0, + "PREFIX_CORRUPTED": 0, + "DISJOINT": 0, + "MISSING_IN_DYNAMO": 0, + "EMPTY_IN_DYNAMO": 0, + } + for c in candidates: + pmid = c["pmid"] + db_abs = c["abstract"] + present = pmid in dyn_present + dyn_abs = dyn_abstracts.get(pmid, "") + verdict = classify(db_abs, dyn_abs, present) + counters[verdict] += 1 + rows.append({ + "pmid": pmid, + "db_len": c["db_len"], + "dyn_len": len(dyn_abs) if present else "", + "verdict": verdict, + "db_head": safe_oneline(db_abs, 80), + "db_tail": 
safe_oneline(db_abs[-80:] if len(db_abs) >= 80 else db_abs, 80), + "dyn_head": safe_oneline(dyn_abs, 80) if present else "", + }) + + with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(rows) + logger.info(f"Per-row audit written to {OUTPUT_CSV}") + + logger.info("Verdict counts:") + for k in ("CLEAN", "PREFIX_CORRUPTED", "DISJOINT", + "MISSING_IN_DYNAMO", "EMPTY_IN_DYNAMO"): + logger.info(f" {k:18s} {counters[k]}") + + suspect = [r for r in rows if r["verdict"] in ("PREFIX_CORRUPTED", "DISJOINT")] + suspect.sort(key=lambda r: r["db_len"], reverse=True) + + if suspect: + with open(DUMP_FILE, "w", encoding="utf-8") as f: + for r in suspect[:DUMP_TOP_N]: + pmid = r["pmid"] + db_abs = next(c["abstract"] for c in candidates if c["pmid"] == pmid) + dyn_abs = dyn_abstracts.get(pmid, "") + f.write("=" * 80 + "\n") + f.write( + f"pmid={pmid} verdict={r['verdict']} " + f"db_len={r['db_len']} dyn_len={r['dyn_len']}\n" + ) + f.write("--- DB (full) ---\n") + f.write(db_abs + "\n") + f.write("--- Dynamo (full) ---\n") + f.write(dyn_abs + "\n\n") + logger.info(f"Top {DUMP_TOP_N} suspects dumped to {DUMP_FILE}") + + logger.info(f"Top {min(DUMP_TOP_N, len(suspect))} suspects (summary):") + for r in suspect[:DUMP_TOP_N]: + logger.info( + f" pmid={r['pmid']:>9} verdict={r['verdict']:17s} " + f"db_len={r['db_len']:>6} dyn_len={r['dyn_len']}" + ) + logger.info(f" db_head : {r['db_head']!r}") + logger.info(f" db_tail : {r['db_tail']!r}") + logger.info(f" dyn_head : {r['dyn_head']!r}") + + +if __name__ == "__main__": + main() diff --git a/update/dataTransformer.py b/update/dataTransformer.py index e9328b8..f89d502 100644 --- a/update/dataTransformer.py +++ b/update/dataTransformer.py @@ -227,13 +227,15 @@ def process_person_article(items, output_path): "scopusNonTargetAuthorInstitutionalAffiliationSource", "scopusNonTargetAuthorInstitutionalAffiliationScore", - 
"datePublicationAddedToEntrez", "doi", + "datePublicationAddedToEntrez", "datePublicationAddedToPMC", "doi", "issn", "issue", "journalTitleISOabbreviation", "pages", "timesCited", "volume", - + "feedbackScoreCites", "feedbackScoreCoAuthorName", "feedbackScoreEmail", "feedbackScoreInstitution", "feedbackScoreJournal", "feedbackScoreJournalSubField", "feedbackScoreKeyword", "feedbackScoreOrcid", "feedbackScoreOrcidCoAuthor", "feedbackScoreOrganization", "feedbackScoreTargetAuthorName", "feedbackScoreYear", + "feedbackScoreTextSimilarity", "feedbackScoreJournalTitleSimilarity", + "feedbackScoreBibliographicCoupling", "totalArticleScoreStandardized", "totalArticleScoreNonStandardized" ] @@ -398,6 +400,7 @@ def process_person_article(items, output_path): # Additional fields date_publication_added_to_entrez = sanitize_field(article.get('datePublicationAddedToEntrez', '')) + date_publication_added_to_pmc = sanitize_field(article.get('datePublicationAddedToPMC', '')) doi = sanitize_field(article.get('doi', '')) issn_list = article.get('issn', []) issn = '' @@ -432,11 +435,13 @@ def process_person_article(items, output_path): 'feedbackScoreCites', 'feedbackScoreCoAuthorName', 'feedbackScoreEmail', 'feedbackScoreInstitution', 'feedbackScoreJournal', 'feedbackScoreJournalSubField', 'feedbackScoreKeyword', 'feedbackScoreOrcid', 'feedbackScoreOrcidCoAuthor', - 'feedbackScoreOrganization', 'feedbackScoreTargetAuthorName', 'feedbackScoreYear' + 'feedbackScoreOrganization', 'feedbackScoreTargetAuthorName', 'feedbackScoreYear', + 'feedbackScoreTextSimilarity', 'feedbackScoreJournalTitleSimilarity', + 'feedbackScoreBibliographicCoupling' ] ] else: - feedback_scores = [''] * 12 # Assuming 12 feedback scores + feedback_scores = [''] * 15 # Assuming 15 feedback scores total_article_score_standardized = sanitize_field(article.get('totalArticleScoreStandardized', '')) total_article_score_non_standardized = sanitize_field(article.get('totalArticleScoreNonStandardized', '')) @@ -508,6 
+513,7 @@ def process_person_article(items, output_path): "scopusNonTargetAuthorInstitutionalAffiliationScore": scopus_non_target_author_institutional_affiliation_score, "datePublicationAddedToEntrez": date_publication_added_to_entrez, + "datePublicationAddedToPMC": date_publication_added_to_pmc, "doi": doi, "issn": issn, "issue": issue, @@ -529,7 +535,10 @@ def process_person_article(items, output_path): "feedbackScoreOrganization": feedback_scores[9], "feedbackScoreTargetAuthorName": feedback_scores[10], "feedbackScoreYear": feedback_scores[11], - + "feedbackScoreTextSimilarity": feedback_scores[12], + "feedbackScoreJournalTitleSimilarity": feedback_scores[13], + "feedbackScoreBibliographicCoupling": feedback_scores[14], + "totalArticleScoreStandardized": total_article_score_standardized, "totalArticleScoreNonStandardized": total_article_score_non_standardized } diff --git a/update/repairAbstracts.py b/update/repairAbstracts.py new file mode 100644 index 0000000..3f9466d --- /dev/null +++ b/update/repairAbstracts.py @@ -0,0 +1,358 @@ +""" +repairAbstracts.py -- one-shot cleanup of reporting_abstracts rows flagged +as corrupted by update/auditAbstracts.py. + +Reads audit_abstracts.csv (the audit output) and: + 1. Backs up the affected rows to reporting_abstracts_corrupt_backup_. + 2. Deletes the corrupted rows from reporting_abstracts in batches. + 3. Dedupes any remaining pmids that have multiple rows by keeping the + row with MIN(id) and backing up the rest to the same backup table. + (Precondition for the v1.4 UNIQUE KEY migration.) + 4. Verifies post-state row counts and confirms no duplicate pmids remain. + +After this script runs, the next nightly update/abstractImport.py will +re-fetch the deleted PMIDs cleanly via the parameterized executemany +path introduced in PR #78. + +Destructive. Requires --apply to perform the delete; without --apply it +runs in dry-run mode (counts only, no writes). 
def read_invalid_pmids(audit_csv):
    """Return the sorted, de-duplicated PMIDs that the audit flagged as invalid.

    Reads the CSV at ``audit_csv`` and keeps every row whose ``verdict`` is
    one of ``INVALID_VERDICTS``.

    Args:
        audit_csv: Path to the audit CSV. It must contain ``pmid`` and
            ``verdict`` columns.

    Returns:
        A sorted list of unique integer PMIDs (empty if no row is flagged).

    Exits the process with status 1, after logging an error, when the file
    does not exist, has no data rows, or lacks a required column.
    """
    if not os.path.exists(audit_csv):
        logger.error(f"Audit CSV not found: {audit_csv}")
        logger.error("Run update/auditAbstracts.py first.")
        sys.exit(1)

    # Read with the same settings the audit script writes with (utf-8,
    # newline=""). The file carries abstract snippets, so decoding with the
    # platform default encoding can fail or mis-decode on a non-UTF-8 locale,
    # and the csv module requires newline="" to parse quoted fields that
    # contain line breaks.
    with open(audit_csv, newline="", encoding="utf-8") as f:
        rows = list(csv.DictReader(f))

    if not rows or "verdict" not in rows[0] or "pmid" not in rows[0]:
        logger.error(f"{audit_csv} is missing required columns (pmid, verdict).")
        sys.exit(1)

    # A set collapses PMIDs that appear on more than one audited row.
    return sorted({
        int(r["pmid"]) for r in rows if r["verdict"] in INVALID_VERDICTS
    })
def writable_columns(cur, table="reporting_abstracts"):
    """Return the names of ``table``'s non-generated columns, in table order.

    Generated columns cannot be assigned in an INSERT, so callers that copy
    rows between tables must enumerate only the columns returned here.

    Args:
        cur: An open cursor that returns rows as dicts.
        table: Table name in the current database.

    Returns:
        A list of column names ordered by ``ordinal_position``.
    """
    # The explicit alias pins the key of each result dict. Without it the key
    # is whatever lettercase the server reports for INFORMATION_SCHEMA column
    # names, which MySQL 8 documents as possibly differing from older
    # servers; a "COLUMN_NAME" key would make the lookup below raise KeyError.
    cur.execute(
        "SELECT column_name AS column_name FROM information_schema.columns "
        "WHERE table_schema = DATABASE() AND table_name = %s "
        "  AND (extra IS NULL OR extra NOT LIKE '%%GENERATED%%') "
        "ORDER BY ordinal_position",
        (table,),
    )
    return [row["column_name"] for row in cur.fetchall()]
def count_duplicate_extras(cur):
    """Count duplicated pmids in reporting_abstracts.

    Args:
        cur: An open cursor that returns rows as dicts.

    Returns:
        A ``(group_count, extra_row_count)`` tuple of ints. ``group_count``
        is the number of pmids that have more than one row;
        ``extra_row_count`` is the number of rows that would have to be
        deleted to leave exactly one row per pmid.
    """
    # The aliases deliberately avoid the bare word "groups": GROUPS is a
    # reserved word from MySQL 8.0.2 onward, so "AS groups" is a syntax error
    # on those servers.
    cur.execute(
        "SELECT COUNT(*) AS dup_groups, COALESCE(SUM(c - 1), 0) AS dup_extras FROM ("
        "  SELECT COUNT(*) AS c FROM reporting_abstracts GROUP BY pmid HAVING c > 1"
        ") d"
    )
    row = cur.fetchone()
    # SUM() over integers is a DECIMAL in MySQL, which the driver may hand
    # back as decimal.Decimal; normalise so callers always get plain ints.
    return int(row["dup_groups"]), int(row["dup_extras"])
Without this flag, dry-run only.") + parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, + help=f"PMIDs per statement (default {DEFAULT_BATCH_SIZE})") + parser.add_argument("--backup-table", default=None, + help="Backup table name (default: reporting_abstracts_corrupt_backup_)") + args = parser.parse_args() + + pmids = read_invalid_pmids(args.audit_csv) + logger.info(f"Read {len(pmids):,} invalid PMIDs from {args.audit_csv}") + if not pmids: + logger.info("Nothing to repair.") + return + + backup_table = args.backup_table or ( + f"reporting_abstracts_corrupt_backup_" + f"{datetime.datetime.now():%Y%m%d_%H%M%S}" + ) + if not SAFE_IDENT.match(backup_table): + logger.error(f"Refusing unsafe backup-table identifier: {backup_table!r}") + sys.exit(1) + + conn = connect_mysql() + try: + with conn.cursor() as cur: + cur.execute("SELECT COUNT(*) AS c FROM reporting_abstracts") + before_total = cur.fetchone()["c"] + + matching = count_matching(cur, pmids) + logger.info( + f"reporting_abstracts: {before_total:,} rows total; " + f"{matching:,} rows match the invalid-PMID list." + ) + + if matching > len(pmids): + logger.info( + f"Live matches ({matching:,}) > unique PMIDs ({len(pmids):,}): " + f"{matching - len(pmids):,} of the audited PMIDs have multiple " + "rows in the live table (all of which will be deleted by the IN clause)." + ) + elif matching < len(pmids): + logger.warning( + f"Live matches ({matching:,}) < unique PMIDs ({len(pmids):,}): " + f"{len(pmids) - matching:,} audited PMIDs no longer present " + "(already deleted or table changed). Proceeding with what is live." + ) + + dupe_groups, dupe_extras = count_duplicate_extras(cur) + logger.info( + f"Duplicate-pmid groups: {dupe_groups:,} " + f"({dupe_extras:,} extra rows would be deduped after the corruption delete)." 
+ ) + + if not args.apply: + cur.execute( + "SELECT pmid, LENGTH(abstract) AS db_len FROM reporting_abstracts " + "WHERE LENGTH(abstract) >= 4000 ORDER BY LENGTH(abstract) DESC LIMIT 3" + ) + samples = cur.fetchall() + logger.info("Sample of longest current rows (pre-repair):") + for s in samples: + logger.info(f" pmid={s['pmid']:>9} db_len={s['db_len']}") + logger.info(f"Would back up to: `{backup_table}`") + logger.info( + f"Would delete {matching:,} corrupted rows + dedupe " + f"{dupe_extras:,} duplicate-extras (keep MIN(id) per pmid)." + ) + logger.info("DRY RUN -- no changes made. Re-run with --apply to perform the repair.") + return + + logger.info(f"Creating backup table `{backup_table}` ...") + backed_up = backup_rows(cur, pmids, backup_table, args.batch_size) + logger.info(f"Backed up {backed_up:,} rows to `{backup_table}`.") + if backed_up != matching: + logger.error( + f"Backup row count {backed_up:,} != expected {matching:,}. Aborting." + ) + sys.exit(1) + + logger.info("Deleting corrupted rows from reporting_abstracts ...") + deleted = delete_rows(cur, pmids, args.batch_size) + + cur.execute("SELECT COUNT(*) AS c FROM reporting_abstracts") + after_total = cur.fetchone()["c"] + logger.info( + f"Deleted {deleted:,} rows; reporting_abstracts now has " + f"{after_total:,} rows (was {before_total:,})." + ) + if before_total - after_total != deleted: + logger.error( + f"Row-count delta mismatch: before-after={before_total - after_total}, " + f"deleted={deleted}. Backup table `{backup_table}` is intact." + ) + sys.exit(1) + + cur.execute( + "SELECT COUNT(*) AS c FROM reporting_abstracts WHERE LENGTH(abstract) >= 4000" + ) + long_remaining = cur.fetchone()["c"] + logger.info( + f"Rows with LENGTH(abstract) >= 4000 remaining: {long_remaining:,} " + "(should approximately equal the CLEAN count from the audit)." 
+ ) + + cur.execute( + "SELECT COUNT(*) AS c FROM reporting_abstracts WHERE LENGTH(abstract) >= 60000" + ) + cap_remaining = cur.fetchone()["c"] + logger.info( + f"Rows at/above 60K (BLOB-cap region) remaining: {cap_remaining:,} " + "(should be 0 if repair caught all corruption)." + ) + + dupe_groups_after, dupe_extras_after = count_duplicate_extras(cur) + if dupe_extras_after > 0: + logger.info( + f"Phase 2: deduping {dupe_extras_after:,} extra rows across " + f"{dupe_groups_after:,} pmid groups (keeping MIN(id) per pmid)..." + ) + backed_up_dupes = backup_duplicate_extras(cur, backup_table) + logger.info(f" ... backed up {backed_up_dupes:,} duplicate rows to `{backup_table}`.") + if backed_up_dupes != dupe_extras_after: + logger.error( + f"Dedup backup count {backed_up_dupes:,} != expected {dupe_extras_after:,}. " + "Aborting before delete." + ) + sys.exit(1) + deleted_dupes = delete_duplicate_extras(cur) + logger.info(f" ... deleted {deleted_dupes:,} duplicate rows.") + if deleted_dupes != dupe_extras_after: + logger.error( + f"Dedup delete count {deleted_dupes:,} != expected {dupe_extras_after:,}. " + f"Backup table `{backup_table}` is intact." + ) + sys.exit(1) + else: + logger.info("Phase 2: no duplicates to dedupe.") + + dupes = find_duplicate_pmids(cur) + if dupes: + logger.error( + f"{len(dupes)} duplicate pmid(s) still present after dedup (sample): " + f"{[(d['pmid'], d['c']) for d in dupes]}" + ) + sys.exit(1) + else: + logger.info( + "No duplicate pmids remain. Safe to apply " + "setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql." 
+ ) + finally: + conn.close() + + +if __name__ == "__main__": + main() diff --git a/update/retrieveNIH.py b/update/retrieveNIH.py index ae7f01d..9b39d72 100644 --- a/update/retrieveNIH.py +++ b/update/retrieveNIH.py @@ -138,22 +138,23 @@ def write_records_to_csv(records, csv_files): nih_writer.writerow(nih_record) nih_count += 1 - citing_pmid = get_dict_value(record, "pmid") + queried_pmid = get_dict_value(record, "pmid") - # Write to analysis_nih_cites + # iCite "cited_by" = articles that cite queried_pmid; "references" = articles queried_pmid cites. + # CSV column order matches LOAD DATA columns: (cited_pmid, citing_pmid). if record.get("cited_by"): - for cited_by in record["cited_by"]: - cites_writer.writerow([cited_by, citing_pmid]) + for citing in record["cited_by"]: + cites_writer.writerow([queried_pmid, citing]) cites_count += 1 if record.get("references"): - for ref in record["references"]: - cites_writer.writerow([ref, citing_pmid]) + for cited in record["references"]: + cites_writer.writerow([cited, queried_pmid]) cites_count += 1 # Write to analysis_nih_cites_clin if record.get("cited_by_clin"): - for cited_by_clin in record["cited_by_clin"]: - cites_clin_writer.writerow([cited_by_clin, citing_pmid]) + for citing_clin in record["cited_by_clin"]: + cites_clin_writer.writerow([queried_pmid, citing_clin]) cites_clin_count +=1 except Exception as e: @@ -207,6 +208,14 @@ def create_staging_tables(mysql_cursor, tables): mysql_cursor.execute(f"CREATE TABLE {staging_table} LIKE {table_name}") mysql_cursor.execute(f"ALTER TABLE {staging_table} MODIFY COLUMN id int(11) NOT NULL AUTO_INCREMENT") logger.info(f"Created staging table: {staging_table}") + # Add unique constraint on pmid for analysis_nih_new to prevent duplicates + try: + mysql_cursor.execute( + "ALTER TABLE analysis_nih_new ADD UNIQUE KEY uk_pmid (pmid)" + ) + logger.info("Added UNIQUE constraint on pmid for analysis_nih_new") + except Exception as e: + logger.warning(f"Could not add UNIQUE constraint 
(may already exist): {e}") def atomic_table_swap(mysql_db, mysql_cursor, tables): """ @@ -274,6 +283,7 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m Validate staging table has sufficient data before swap. - Must have at least min_rows - Must have at least min_percentage of production table's row count + - Detects duplicate pmids in production (corruption) and uses unique count instead """ mysql_cursor.execute(f"SELECT COUNT(*) as cnt FROM {staging_table}") staging_count = mysql_cursor.fetchone()['cnt'] @@ -281,8 +291,19 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m mysql_cursor.execute(f"SELECT COUNT(*) as cnt FROM {production_table}") production_count = mysql_cursor.fetchone()['cnt'] + # Check for duplicates in production (corruption detection) + mysql_cursor.execute(f"SELECT COUNT(DISTINCT pmid) as cnt FROM {production_table}") + unique_production = mysql_cursor.fetchone()['cnt'] + logger.info(f"Validation: {staging_table} has {staging_count} rows, " - f"{production_table} has {production_count} rows") + f"{production_table} has {production_count} rows " + f"({unique_production} unique pmids)") + + if production_count != unique_production: + logger.warning(f"CORRUPTION DETECTED: {production_table} has " + f"{production_count - unique_production} duplicate rows. 
" + f"Using unique count ({unique_production}) for validation.") + production_count = unique_production # Use deduped count for comparison # Check minimum rows if staging_count < min_rows: @@ -303,6 +324,21 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m logger.info(f"Validation PASSED for {staging_table}") return True +def check_production_integrity(mysql_cursor, table_name, key_column='pmid'): + """Check if production table has duplicate key values (corruption indicator).""" + mysql_cursor.execute(f""" + SELECT COUNT(*) as total_rows, COUNT(DISTINCT {key_column}) as unique_keys + FROM {table_name} + """) + result = mysql_cursor.fetchone() + total = result['total_rows'] + unique = result['unique_keys'] + if total != unique: + logger.warning(f"INTEGRITY CHECK: {table_name} has {total} rows but only " + f"{unique} unique {key_column} values ({total - unique} duplicates)") + return False + return True + ######### if __name__ == '__main__': diff --git a/update/retrieveReporter.py b/update/retrieveReporter.py new file mode 100644 index 0000000..36c8e88 --- /dev/null +++ b/update/retrieveReporter.py @@ -0,0 +1,443 @@ +# retrieveReporter.py +# +# Pulls grant metadata and pub-grant linkages from NIH RePORTER +# (https://api.reporter.nih.gov/v2/) and reconciles them against the +# ReCiter-derived person_article_grant table. +# +# Two API loops: +# 1. POST /projects/search filtered by WCM org name → grant_reporter_project +# 2. POST /publications/search keyed by appl_ids from step 1 → grant_reporter_link +# +# Then a SQL reconciliation step populates grant_provenance, the long-lived +# (person, pmid, grant)-keyed audit log that survives the nightly truncate- +# reload of person_article_grant. See setup/alter_add_reporter_fields_v1.2.sql +# for the full design rationale. +# +# Why we filter by org_name rather than fetching everything: +# RePORTER returns ~thousands of WCM-attributed projects. 
Pulling the full +# corpus would require partitioning by FY (offset cap is 9,999) and gives +# no benefit for our use case. Subaward caveat: WCM-as-sub may not appear +# under this org filter — accepted as a false-negative tradeoff to keep +# false positives near zero. + +import os +import sys +import csv +import time +import random +import re +import logging +import faulthandler +import signal +import requests +import pymysql.cursors +import pymysql.err + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('retrieveReporter.log', mode='w'), + logging.StreamHandler(sys.stdout), + ], +) +logger = logging.getLogger(__name__) + +faulthandler.enable(file=sys.stderr, all_threads=True) +faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True) + +REPORTER_BASE_URL = 'https://api.reporter.nih.gov/v2' +WCM_ORG_NAME = 'WEILL MEDICAL COLL OF CORNELL UNIV' +PAGE_LIMIT = 500 +OFFSET_CAP = 9999 +REQUEST_INTERVAL_SEC = 1.0 # NIH guidance: 1 req/sec +PUBS_BATCH_SIZE = 50 # appl_ids per /publications/search call + +# core_project_num pattern, e.g. "R01DK127777", "U01AI189285", "K23MH112873". +# Prefix is 1-3 alphanumeric (activity code) + 2 letters (IC) + 5-7 digits. 
def post_with_retry(url, payload, max_retries=5, backoff_factor=1):
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON body.

    Retries with exponential backoff plus jitter on HTTP 429, on any
    ``requests`` error (including non-2xx statuses), and on a response body
    that is not valid JSON. Pacing between *successful* calls (NIH asks for
    1 request/second) is left to the caller.

    Args:
        url: Endpoint to POST to.
        payload: JSON-serialisable request body.
        max_retries: Maximum number of attempts.
        backoff_factor: Base, in seconds, of the exponential backoff.

    Returns:
        The parsed JSON response.

    Raises:
        RuntimeError: when every attempt fails. The last underlying
            exception, if any, is chained as ``__cause__``.
    """
    last_error = None
    for retry in range(max_retries):
        try:
            r = requests.post(url, json=payload, timeout=(10, 90))
            if r.status_code == 429:
                # Rate limited: no exception is involved, just back off.
                last_error = None
                wait = backoff_factor * (2 ** retry) + random.uniform(0, 5)
                logger.warning('429 from RePORTER; sleeping %.1fs', wait)
                time.sleep(wait)
                continue
            r.raise_for_status()
            return r.json()
        # ValueError covers r.json() on an undecodable body: older requests
        # releases raise a plain ValueError there, which is not a
        # RequestException and would otherwise escape the retry loop.
        except (requests.exceptions.RequestException, ValueError) as e:
            last_error = e
            wait = backoff_factor * (2 ** retry) + random.uniform(0, 1)
            logger.error('RePORTER request failed (attempt %d): %s; sleep %.1fs',
                         retry + 1, e, wait)
            time.sleep(wait)
    # Chain the last failure so the traceback shows why the retries ran out.
    raise RuntimeError(
        f'RePORTER request failed after {max_retries} retries: {url}'
    ) from last_error
Caller must ensure + the result set fits under OFFSET_CAP; we log and stop if it doesn't.""" + url = f'{REPORTER_BASE_URL}/projects/search' + offset = 0 + while offset <= OFFSET_CAP: + payload = { + 'criteria': criteria, + 'limit': PAGE_LIMIT, + 'offset': offset, + } + data = post_with_retry(url, payload) + results = data.get('results', []) or [] + if not results: + return + for row in results: + yield row + meta = data.get('meta', {}) or {} + total = meta.get('total', 0) + offset += PAGE_LIMIT + if offset >= total: + return + if offset > OFFSET_CAP: + logger.warning( + 'Result set has %d records but offset cap is %d; truncating. ' + 'Caller should partition further (e.g. by activity_code).', + total, OFFSET_CAP) + return + time.sleep(REQUEST_INTERVAL_SEC) + + +def fetch_projects(base_criteria): + """Yield project dicts, partitioning by fiscal year when needed to stay + under the offset cap. WCM has ~15K projects historically, which exceeds + the 9,999 offset limit on a single criteria block. + + Strategy: probe total once with the base criteria. If under the cap, + return all in one stream. 
def fetch_publications_for_appl_ids(appl_ids):
    """Yield ``(pmid, appl_id, core_project_num)`` from /publications/search.

    The ids are de-duplicated and queried in batches of ``PUBS_BATCH_SIZE``;
    each batch is paged ``PAGE_LIMIT`` rows at a time until the reported
    total is reached, an empty page comes back, or the offset passes
    ``OFFSET_CAP``.

    Args:
        appl_ids: Iterable of application ids; ``None`` entries are ignored.

    Yields:
        ``(int pmid, int appl_id, core_project_num)`` for every result row
        that carries both a pmid and an appl_id. ``core_project_num`` is
        passed through as returned and may be ``None``.
    """
    url = f'{REPORTER_BASE_URL}/publications/search'
    # Sorting makes the batch composition reproducible from run to run.
    unique_ids = sorted({int(x) for x in appl_ids if x is not None})
    for start in range(0, len(unique_ids), PUBS_BATCH_SIZE):
        batch = unique_ids[start:start + PUBS_BATCH_SIZE]
        offset = 0
        while offset <= OFFSET_CAP:
            payload = {
                'criteria': {'appl_ids': batch},
                'limit': PAGE_LIMIT,
                'offset': offset,
            }
            data = post_with_retry(url, payload)
            results = data.get('results', []) or []
            if not results:
                break
            for row in results:
                # Accept either spelling of the field names in the response.
                pmid = row.get('pmid')
                appl_id = row.get('applid') or row.get('appl_id')
                core = row.get('coreproject') or row.get('core_project_num')
                if pmid and appl_id:
                    yield int(pmid), int(appl_id), core
            meta = data.get('meta', {}) or {}
            total = meta.get('total', 0)
            offset += PAGE_LIMIT
            if offset >= total:
                break
            time.sleep(REQUEST_INTERVAL_SEC)
        else:
            # Reached only when the loop ends because offset passed the cap,
            # i.e. rows remain that cannot be paged to. Say so instead of
            # dropping them silently.
            logger.warning(
                'Publication result set for appl_id batch at index %d has %d '
                'records but offset cap is %d; truncating. '
                'Reduce PUBS_BATCH_SIZE to fetch the remainder.',
                start, total, OFFSET_CAP)
        time.sleep(REQUEST_INTERVAL_SEC)
# Core project number inside a free-text grant string. The three captured
# parts may be separated by whitespace, hyphens or slashes.
_CORE_PROJECT_TOKEN_RE = re.compile(
    r'(?<![A-Z0-9])'                  # do not start inside a longer token
    r'(?:\d[\s\-]*)?'                 # optional application-type digit ("5R01...")
    r'([A-Z](?:\d[A-Z0-9]|[A-Z]\d))'  # activity code: R01, K23, UL1, U2C
    r'[\s\-/]*'
    r'([A-Z]{2})'                     # two-letter institute/center code
    r'[\s\-/]*'
    r'(\d{5,7})'                      # serial number
    r'(?!\d)'                         # ...that is not part of a longer number
)


def normalize_grant_string(raw):
    """Extract a core project number (e.g. ``R01DK127777``) from free text.

    Matching is case-insensitive. Separators between the activity code,
    institute code and serial number are dropped, as are a leading
    application-type digit and anything after the serial number (support
    year, amendment suffix, funder text):

        "R01 DK127777"                 -> "R01DK127777"
        "5R01DK127777-03"              -> "R01DK127777"
        "K23 MH112873/MH/NIMH NIH HHS" -> "K23MH112873"

    The pattern is searched in the original text. Removing the separators
    first would fuse the serial number with whatever follows it
    ("...12777701A1"), so no end-of-number boundary could be found.

    Args:
        raw: Grant string, or a falsy value.

    Returns:
        The normalised core project number, or ``None`` when ``raw`` is
        falsy or contains no recognisable grant number. The caller decides
        whether to fall back to the raw string.
    """
    if not raw:
        return None
    m = _CORE_PROJECT_TOKEN_RE.search(raw.upper())
    return ''.join(m.groups()) if m else None
+ logger.info('Reading person_article_grant for reconciliation') + cur.execute(""" + SELECT personIdentifier, pmid, articleGrant + FROM person_article_grant + WHERE personIdentifier IS NOT NULL + AND pmid > 0 + AND articleGrant IS NOT NULL + AND articleGrant <> '' + """) + pag_rows = cur.fetchall() + logger.info('person_article_grant rows considered: %d', len(pag_rows)) + + # Normalize + dedupe in Python (the temp table's PK enforces uniqueness + # but deduping here avoids LOAD DATA INFILE warnings on duplicate rows). + seen = set() + normalized = [] + for row in pag_rows: + n = normalize_grant_string(row['articleGrant']) + if not n: + # Non-NIH fallback: sanitize control chars (CSV uses TAB delim) + n = re.sub(r'[\t\n\r]', ' ', row['articleGrant'])[:64] + key = (row['personIdentifier'], row['pmid'], n) + if key in seen: + continue + seen.add(key) + normalized.append(key) + logger.info('Normalized + deduped to %d distinct (person, pmid, grant) rows', + len(normalized)) + + csv_file = tempfile.NamedTemporaryFile( + delete=False, mode='w', suffix='.csv', newline='', encoding='utf-8') + try: + writer = csv.writer(csv_file, delimiter='\t', lineterminator='\n', + quoting=csv.QUOTE_NONE, escapechar='\\') + for r in normalized: + writer.writerow(r) + csv_file.close() + + cur.execute("DROP TEMPORARY TABLE IF EXISTS _reciter_grant_staging") + cur.execute(""" + CREATE TEMPORARY TABLE _reciter_grant_staging ( + personIdentifier VARCHAR(128) NOT NULL, + pmid INT NOT NULL, + core_project_num VARCHAR(64) NOT NULL, + PRIMARY KEY (personIdentifier, pmid, core_project_num) + ) ENGINE=InnoDB + """) + load_sql = ( + f"LOAD DATA LOCAL INFILE '{csv_file.name}' " + "INTO TABLE _reciter_grant_staging " + "FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n' " + "(personIdentifier, pmid, core_project_num)" + ) + cur.execute(load_sql) + cur.execute("SELECT COUNT(*) AS c FROM _reciter_grant_staging") + logger.info('Loaded %d rows into reciterdb staging table', + cur.fetchone()['c']) + + 
cur.execute(""" + INSERT INTO grant_provenance + (personIdentifier, pmid, core_project_num, + source_reciterdb, reciterdb_first_seen, last_verified) + SELECT personIdentifier, pmid, core_project_num, + 1, NOW(), NOW() + FROM _reciter_grant_staging + ON DUPLICATE KEY UPDATE + source_reciterdb = 1, + last_verified = NOW() + """) + # rowcount on bulk upsert is "1 per insert + 2 per update" in MariaDB + # — informative, not exact + logger.info('Reciterdb-side bulk upsert: %d rowcount', cur.rowcount) + cur.execute("DROP TEMPORARY TABLE _reciter_grant_staging") + conn.commit() + finally: + try: + os.unlink(csv_file.name) + except OSError: + pass + + # ----- (2) RePORTER side ----- + # Pure SQL — no Python iteration. The JOIN to person_article enforces + # the false-positive guard (only ACCEPTED PMIDs credit a person). + # GROUP BY collapses cases where one (person, pmid, core_project) has + # multiple appl_ids (different fiscal years of the same grant); MAX + # picks the most recent appl_id deterministically. 
def main():
    """Run the RePORTER ETL: reload both staging tables, then reconcile.

    Steps, in order:
      1. Fetch projects for the organisation named by ``REPORTER_ORG_NAME``
         (default ``WCM_ORG_NAME``) and reload ``grant_reporter_project``.
      2. Fetch publication links for those projects' appl_ids and reload
         ``grant_reporter_link``.
      3. Call ``reconcile_provenance``.

    The database connection is closed even when a step fails.

    Raises:
        RuntimeError: if RePORTER returns no usable project. Raised before
            any table is truncated, so an upstream outage or a wrong org
            name cannot empty the staging tables.
    """
    org_name = os.environ.get('REPORTER_ORG_NAME', WCM_ORG_NAME)
    logger.info('Starting RePORTER ETL for org: %s', org_name)

    conn = connect_db()
    try:
        # ----- Loop A: projects -----
        # No include_fields — the API expects CamelCase there ('ApplId') but
        # response field names are snake_case ('appl_id'). Easier to take all
        # fields back than maintain two name conventions.
        project_rows = []
        appl_ids = []
        seen_appl_ids = set()
        for proj in fetch_projects(base_criteria={'org_names': [org_name]}):
            appl_id = proj.get('appl_id')
            if not appl_id:
                continue
            # RePORTER returns a project under every fiscal year it was
            # active, so the FY-partitioned fetch (used when the corpus
            # exceeds the offset cap) yields the same appl_id more than once.
            # appl_id is grant_reporter_project's PRIMARY KEY, so dedup
            # before the reload.
            if appl_id in seen_appl_ids:
                continue
            seen_appl_ids.add(appl_id)
            appl_ids.append(appl_id)
            org = (proj.get('organization') or {}).get('org_name')
            project_rows.append((
                int(appl_id),
                proj.get('core_project_num'),
                (proj.get('project_title') or '')[:512],
                (org or '')[:255],
                proj.get('fiscal_year'),
                proj.get('activity_code'),
                proj.get('project_start_date'),
                proj.get('project_end_date'),
                proj.get('abstract_text'),
                # NIH-curated keyword vocabularies, stored raw (issue #291).
                # 'terms' is angle-bracket-wrapped; 'pref_terms' is
                # semicolon-delimited. Parsed downstream by the SPS funding ETL.
                proj.get('terms'),
                proj.get('pref_terms'),
            ))
        logger.info('Fetched %d RePORTER projects', len(project_rows))

        # reload_table truncates before inserting, so an empty fetch would
        # wipe the project table and then, with no appl_ids to query, the
        # link table as well. Stop here instead.
        if not project_rows:
            raise RuntimeError(
                f'RePORTER returned no projects for org {org_name!r}; '
                'refusing to truncate the staging tables')

        reload_table(
            conn,
            'grant_reporter_project',
            project_rows,
            ['appl_id', 'core_project_num', 'project_title', 'org_name',
             'fiscal_year', 'activity_code', 'project_start_date',
             'project_end_date', 'abstract_text', 'project_terms', 'pref_terms'],
        )

        # ----- Loop B: publications -----
        link_rows = []
        seen_pairs = set()
        for pmid, appl_id, core in fetch_publications_for_appl_ids(appl_ids):
            key = (pmid, appl_id)
            if key in seen_pairs:
                continue
            seen_pairs.add(key)
            link_rows.append((pmid, appl_id, core))
        logger.info('Fetched %d unique (pmid, appl_id) pairs', len(link_rows))
        reload_table(
            conn,
            'grant_reporter_link',
            link_rows,
            ['pmid', 'appl_id', 'core_project_num'],
        )

        # ----- Reconciliation -----
        reconcile_provenance(conn)
    finally:
        # Release the connection on success and on any failure above.
        conn.close()

    logger.info('RePORTER ETL complete')


if __name__ == '__main__':
    main()
"scopusNonTargetAuthorInstitutionalAffiliationSource", "scopusNonTargetAuthorInstitutionalAffiliationScore", - "datePublicationAddedToEntrez", "doi", + "datePublicationAddedToEntrez", "datePublicationAddedToPMC", "doi", "issn", "issue", "journalTitleISOabbreviation", "pages", "timesCited", "volume", "feedbackScoreCites", "feedbackScoreCoAuthorName", "feedbackScoreEmail", "feedbackScoreInstitution", "feedbackScoreJournal", "feedbackScoreJournalSubField", "feedbackScoreKeyword", "feedbackScoreOrcid", "feedbackScoreOrcidCoAuthor", "feedbackScoreOrganization", "feedbackScoreTargetAuthorName", "feedbackScoreYear", + "feedbackScoreTextSimilarity", "feedbackScoreJournalTitleSimilarity", + "feedbackScoreBibliographicCoupling", "totalArticleScoreStandardized", "totalArticleScoreNonStandardized" ], 'person_article_author': [