From 84fd32f79602d84069f07a4debb60aca5a085dbb Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Thu, 5 Mar 2026 12:09:20 -0500
Subject: [PATCH 01/19] Fix validation blocking table swap when production has
 duplicate pmids

The analysis_nih table was loaded with ~527K rows on Dec 18, 2025 -- about
twice the expected ~267K, because pmids were duplicated. Since then, every
nightly run has retrieved the correct ~267K rows, but validation rejects the
swap because 267K/527K = 50.7% falls below the 80% threshold.

Changes:
- validate_data() now detects duplicate pmids in production and uses the
  unique count for percentage comparison, allowing the swap to self-heal
- create_staging_tables() adds a UNIQUE constraint on pmid for
  analysis_nih_new to prevent future duplicate inserts
- Added check_production_integrity() utility for diagnostic use
- Schema: changed analysis_nih.idx_pmid from KEY to UNIQUE KEY

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 setup/createDatabaseTableReciterDb.sql |  2 +-
 update/retrieveNIH.py                  | 37 +++++++++++++++++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql
index e94efd2..13d04d8 100644
--- a/setup/createDatabaseTableReciterDb.sql
+++ b/setup/createDatabaseTableReciterDb.sql
@@ -260,7 +260,7 @@ CREATE TABLE IF NOT EXISTS `analysis_nih` (
   `x_coord` float(5,4) DEFAULT NULL,
   `y_coord` float(5,4) DEFAULT NULL,
   PRIMARY KEY (`id`),
-  KEY `idx_pmid` (`pmid`) USING BTREE
+  UNIQUE KEY `idx_pmid` (`pmid`) USING BTREE
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
 
 CREATE TABLE IF NOT EXISTS `analysis_nih_cites` (
diff --git a/update/retrieveNIH.py b/update/retrieveNIH.py
index ae7f01d..97b40bd 100644
--- a/update/retrieveNIH.py
+++ b/update/retrieveNIH.py
@@ -207,6 +207,14 @@ def create_staging_tables(mysql_cursor, tables):
         mysql_cursor.execute(f"CREATE TABLE {staging_table} LIKE {table_name}")
         mysql_cursor.execute(f"ALTER TABLE {staging_table} MODIFY COLUMN id int(11) NOT NULL AUTO_INCREMENT")
         logger.info(f"Created staging table: {staging_table}")
+    # Add unique constraint on pmid for analysis_nih_new to prevent duplicates
+    try:
+        mysql_cursor.execute(
+            "ALTER TABLE analysis_nih_new ADD UNIQUE KEY uk_pmid (pmid)"
+        )
+        logger.info("Added UNIQUE constraint on pmid for analysis_nih_new")
+    except Exception as e:
+        logger.warning(f"Could not add UNIQUE constraint (may already exist): {e}")
 
 def atomic_table_swap(mysql_db, mysql_cursor, tables):
     """
@@ -274,6 +282,7 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m
     Validate staging table has sufficient data before swap.
     - Must have at least min_rows
     - Must have at least min_percentage of production table's row count
+    - Detects duplicate pmids in production (corruption) and uses unique count instead
     """
     mysql_cursor.execute(f"SELECT COUNT(*) as cnt FROM {staging_table}")
     staging_count = mysql_cursor.fetchone()['cnt']
@@ -281,8 +290,19 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m
     mysql_cursor.execute(f"SELECT COUNT(*) as cnt FROM {production_table}")
     production_count = mysql_cursor.fetchone()['cnt']
 
+    # Check for duplicates in production (corruption detection)
+    mysql_cursor.execute(f"SELECT COUNT(DISTINCT pmid) as cnt FROM {production_table}")
+    unique_production = mysql_cursor.fetchone()['cnt']
+
     logger.info(f"Validation: {staging_table} has {staging_count} rows, "
-                f"{production_table} has {production_count} rows")
+                f"{production_table} has {production_count} rows "
+                f"({unique_production} unique pmids)")
+
+    if production_count != unique_production:
+        logger.warning(f"CORRUPTION DETECTED: {production_table} has "
+                       f"{production_count - unique_production} duplicate rows. "
+                       f"Using unique count ({unique_production}) for validation.")
+        production_count = unique_production  # Use deduped count for comparison
 
     # Check minimum rows
     if staging_count < min_rows:
@@ -303,6 +323,21 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m
     logger.info(f"Validation PASSED for {staging_table}")
     return True
 
+def check_production_integrity(mysql_cursor, table_name, key_column='pmid'):
+    """Return True if table_name has as many rows as distinct key_column values; otherwise log a warning and return False."""
+    mysql_cursor.execute(f"""
+        SELECT COUNT(*) as total_rows, COUNT(DISTINCT {key_column}) as unique_keys
+        FROM {table_name}
+    """)  # table_name and key_column are interpolated into the SQL text, so pass trusted identifiers only
+    result = mysql_cursor.fetchone()  # read by column alias below, so the cursor must return dict-like rows
+    total = result['total_rows']
+    unique = result['unique_keys']
+    if total != unique:  # NOTE: COUNT(DISTINCT col) skips NULLs, so rows with a NULL key also register as a mismatch
+        logger.warning(f"INTEGRITY CHECK: {table_name} has {total} rows but only "
+                       f"{unique} unique {key_column} values ({total - unique} duplicates)")
+        return False
+    return True
+
 #########
 
 if __name__ == '__main__':

From 4605fc2945355d5393cc198e2923e64ccbbb1664 Mon Sep 17 00:00:00 2001
From: paulalbert1 <paa2013@med.cornell.edu>
Date: Mon, 27 Mar 2023 08:59:27 -0400
Subject: [PATCH 02/19] Updated journal impact score inference

---
 setup/insertBaselineDataReciterDb.sql | 353 --------------------------
 setup/table_admin_roles.sql           |   9 +
 update/temp/.gitkeep                  |   0
 3 files changed, 9 insertions(+), 353 deletions(-)
 delete mode 100644 setup/insertBaselineDataReciterDb.sql
 create mode 100644 setup/table_admin_roles.sql
 delete mode 100644 update/temp/.gitkeep

diff --git a/setup/insertBaselineDataReciterDb.sql b/setup/insertBaselineDataReciterDb.sql
deleted file mode 100644
index 081b1c3..0000000
--- a/setup/insertBaselineDataReciterDb.sql
+++ /dev/null
@@ -1,353 +0,0 @@
-SET FOREIGN_KEY_CHECKS = 0;
-TRUNCATE `admin_roles`;
-
-LOCK TABLES `admin_roles` WRITE;
-INSERT INTO `admin_roles` (`roleID`, `roleLabel`) VALUES 
-  (1,'Superuser'),
-  (2,'Curator_All'),
-  (3,'Reporter_All'),
-  (4,'Curator_Self'),
-  (5,'Curator_Department'),
-  (6,'Curator_Department_Delegate');
-UNLOCK TABLES;
-
-TRUNCATE `analysis_special_characters`;
-
-LOCK TABLES `analysis_special_characters` WRITE;
-TRUNCATE `analysis_special_characters`;
-INSERT INTO `analysis_special_characters` (`id`, `specialCharacter`, `RTFescape`, `characterName`) VALUES 
-	(1,'͵','\\\'82','Low left single quote'),
-	(2,'ƒ','\\\'83','Florin'),
-	(3,'„','\\\'84','Low left double quote'),
-	(4,'…','\\\'85','Ellipsis'),
-	(5,'†','\\\'86','Dagger'),
-	(6,'‡','\\\'87','Double dagger'),
-	(7,'∘','\\\'88','Circumflex'),
-	(8,'‰','\\\'89','Permil'),
-	(9,'Š','\\\'8a','S-caron'),
-	(10,'‹','\\\'8b','Single left guillemet'),
-	(11,'Œ','\\\'8c','OE-ligature'),
-	(12,'Ž','\\\'8e','Z-caron'),
-	(13,'‘','\\\'91','Left single quote'),
-	(14,'’','\\\'92','Right single quote'),
-	(15,'“','\\\'93','Left double quote'),
-	(16,'”','\\\'94','Right double quote'),
-	(17,'•','\\\'95','Bullet'),
-	(18,'–','\\\'96','En dash'),
-	(19,'—','\\\'97','Em dash'),
-	(20,'~','\\\'98','Tilde'),
-	(21,'™','\\\'99','Trademark'),
-	(22,'š','\\\'9a','s-caron'),
-	(23,'›','\\\'9b','Single right guillemet'),
-	(24,'œ','\\\'9c','oe ligature'),
-	(25,'ž','\\\'9e','z-caron'),
-	(26,'Ÿ','\\\'9f','Y-diaeresis'),
-	(27,'¡','\\\'a1','Inverted exclamation point'),
-	(28,'¢','\\\'a2','Cent sign'),
-	(29,'£','\\\'a3','Pound sign'),
-	(30,'¤','\\\'a4','General currency sign'),
-	(31,'¥','\\\'a5','Yen sign'),
-	(32,'¦','\\\'a6','Broken vertical bar'),
-	(33,'§','\\\'a7','Section sign'),
-	(34,'¨','\\\'a8','Spacing diaeresis'),
-	(35,'©','\\\'a9','Copyright'),
-	(36,'ª','\\\'aa','Feminine ordinal'),
-	(37,'«','\\\'ab','Left angle quotes'),
-	(38,'¬','\\\'ac','Not sign'),
-	(39,'(-)','\\-','Soft hyphen'),
-	(40,'®','\\\'ae','Registered trademark'),
-	(41,'¯','\\\'af','Macron accent'),
-	(42,'°','\\\'b0','Degree sign'),
-	(43,'±','\\\'b1','Plus or minus sign'),
-	(44,'²','\\\'b2','Superscript 2'),
-	(45,'³','\\\'b3','Superscript 3'),
-	(46,'´','\\\'b4','Acute accent'),
-	(47,'µ','\\\'b5','Micro sign (Greek mu)'),
-	(48,'¶','\\\'b6','Paragraph sign'),
-	(49,'·','\\\'b7','Middle dot'),
-	(50,'¸','\\\'b8','Cedilla'),
-	(51,'¹','\\\'b9','Superscript 1'),
-	(52,'º','\\\'ba','Masculine ordinal'),
-	(53,'»','\\\'bb','Right angle quotes'),
-	(54,'¼','\\\'bc','One-fourth fraction'),
-	(55,'½','\\\'bd','One-half fraction'),
-	(56,'¾','\\\'be','Three-fourths fraction'),
-	(57,'¿','\\\'bf','Inverted question mark'),
-	(58,'À','\\\'c0','A-grave'),
-	(59,'Á','\\\'c1','A-acute'),
-	(60,'Â','\\\'c2','A-circumflex'),
-	(61,'Ã','\\\'c3','A-tilde'),
-	(62,'Ä','\\\'c4','A-diaeresis'),
-	(63,'Å','\\\'c5','A-ring'),
-	(64,'Æ','\\\'c6','AE-ligature'),
-	(65,'Ç','\\\'c7','C-cedilla'),
-	(66,'È','\\\'c8','E-grave'),
-	(67,'É','\\\'c9','E-acute'),
-	(68,'Ê','\\\'ca','E-circumflex'),
-	(69,'Ë','\\\'cb','E-diaeresis'),
-	(70,'Ì','\\\'cc','I-grave'),
-	(71,'Í','\\\'cd','I-acute'),
-	(72,'Î','\\\'ce','I-circumflex'),
-	(73,'Ï','\\\'cf','I-diaeresis'),
-	(74,'Ð','\\\'d0','Uppercase edh'),
-	(75,'Ñ','\\\'d1','N-tilde'),
-	(76,'Ò','\\\'d2','O-grave'),
-	(77,'Ó','\\\'d3','O-acute'),
-	(78,'Ô','\\\'d4','O-circumflex'),
-	(79,'Õ','\\\'d5','O-tilde'),
-	(80,'Ö','\\\'d6','O-diaeresis'),
-	(81,'×','\\\'d7','Multiply sign'),
-	(82,'Ø','\\\'d8','O-slash'),
-	(83,'Ù','\\\'d9','U-grave'),
-	(84,'Ú','\\\'da','U-acute'),
-	(85,'Û','\\\'db','U-circumflex'),
-	(86,'Ü','\\\'dc','U-diaeresis'),
-	(87,'Ý','\\\'dd','Y-acute'),
-	(88,'Þ','\\\'de','Uppercase thorn'),
-	(89,'ß','\\\'df','German ess-zed'),
-	(90,'à','\\\'e0','a-grave'),
-	(91,'á','\\\'e1','a-acute'),
-	(92,'â','\\\'e2','a-circumflex'),
-	(93,'ã','\\\'e3','a-tilde'),
-	(94,'ä','\\\'e4','a-diaeresis'),
-	(95,'å','\\\'e5','a-ring'),
-	(96,'æ','\\\'e6','ae-ligature'),
-	(97,'ç','\\\'e7','c-cedilla'),
-	(98,'è','\\\'e8','e-grave'),
-	(99,'é','\\\'e9','e-acute'),
-	(100,'ê','\\\'ea','e-circumflex'),
-	(101,'ë','\\\'eb','e-diaeresis'),
-	(102,'ì','\\\'ec','i-grave'),
-	(103,'í','\\\'ed','i-acute'),
-	(104,'î','\\\'ee','i-circumflex'),
-	(105,'ï','\\\'ef','i-diaeresis'),
-	(106,'ð','\\\'f0','Lowercase edh'),
-	(107,'ñ','\\\'f1','n-tilde'),
-	(108,'ò','\\\'f2','o-grave'),
-	(109,'ó','\\\'f3','o-acute'),
-	(110,'ô','\\\'f4','o-circumflex'),
-	(111,'õ','\\\'f5','o-tilde'),
-	(112,'ö','\\\'f6','o-diaeresis'),
-	(113,'÷','\\\'f7','Division sign'),
-	(114,'ø','\\\'f8','o-slash'),
-	(115,'ù','\\\'f9','u-grave'),
-	(116,'ú','\\\'fa','u-acute'),
-	(117,'û','\\\'fb','u-circumflex'),
-	(118,'ü','\\\'fc','u-diaeresis'),
-	(119,'ý','\\\'fd','y-acute'),
-	(120,'þ','\\\'fe','Lowercase thorn'),
-	(121,'ÿ','\\\'ff','y-diaeresis'),
-	(122,'č','\\u269 ',NULL),
-	(123,'ć','\\u263 ',NULL),
-	(124,'β','\\u946 ','beta'),
-	(125,'Α','\\u913 ','Alpha'),
-	(126,'Β','\\u914 ','Beta'),
-	(127,'Γ','\\u915 ','Gamma'),
-	(128,'Δ','\\u916 ','Delta'),
-	(129,'Ε','\\u917 ','Epsilon'),
-	(130,'Ζ','\\u918 ','Zeta'),
-	(131,'Η','\\u919 ','Eta'),
-	(132,'Θ','\\u920 ','Theta'),
-	(133,'Ι','\\u921 ','Iota'),
-	(134,'Κ','\\u922 ','Kappa'),
-	(135,'Λ','\\u923 ','Lambda'),
-	(136,'Μ','\\u924 ','Mu'),
-	(137,'Ν','\\u925 ','Nu'),
-	(138,'Ξ','\\u926 ','Xi'),
-	(139,'Ο','\\u927 ','Omicron'),
-	(140,'Π','\\u928 ','Pi'),
-	(141,'Ρ','\\u929 ','Rho'),
-	(142,'Σ','\\u931 ','Sigma'),
-	(143,'Τ','\\u932 ','Tau'),
-	(144,'Υ','\\u933 ','Upsilon'),
-	(145,'Φ','\\u934 ','Phi'),
-	(146,'Χ','\\u935 ','Chi'),
-	(147,'Ψ','\\u936 ','Psi'),
-	(148,'Ω','\\u937 ','Omega'),
-	(149,'α','\\u945 ','Alpha'),
-	(150,'β','\\u946 ','Beta'),
-	(151,'γ','\\u947 ','Gamma'),
-	(152,'δ','\\u948 ','Delta'),
-	(153,'ε','\\u949 ','Epsilon'),
-	(154,'ζ','\\u950 ','Zeta'),
-	(155,'η','\\u951 ','Eta'),
-	(156,'θ','\\u952 ','Theta'),
-	(157,'ι','\\u953 ','Iota'),
-	(158,'κ','\\u954 ','Kappa'),
-	(159,'λ','\\u955 ','Lambda'),
-	(160,'μ','\\u956 ','Mu'),
-	(161,'ν','\\u957 ','Nu'),
-	(162,'ξ','\\u958 ','Xi'),
-	(163,'ο','\\u959 ','Omicron'),
-	(164,'π','\\u960 ','Pi'),
-	(165,'ρ','\\u961 ','Rho'),
-	(166,'σ','\\u963 ','Sigma'),
-	(167,'ς','\\u962 ','Sigma'),
-	(168,'τ','\\u964 ','Tau'),
-	(169,'υ','\\u965 ','Upsilon'),
-	(170,'φ','\\u966 ','Phi'),
-	(171,'χ','\\u967 ','Chi'),
-	(172,'ψ','\\u968 ','Psi'),
-	(173,'ω','\\u969 ','Omega'),
-	(174,'®','\\\'ae','reserved'),
-	(175,'ü','\\\'fc','u umlaut'),
-	(176,'ö','\\\'f6','o umlaut'),
-	(177,'é','\\\'e9','accented e'),
-	(178,'ç','\\\'e7','french c'),
-	(179,'…','\\\'85','ellipsis'),
-	(180,'ó','\\\'f3','accented o'),
-	(181,'™','\\\'99','trademark'),
-	(182,'≤','\\u8804 ','less than or equal to'),
-	(183,'≥','\\u8805 ','greater than or equal to'),
-	(184,'à','\\\'e0','accented a'),
-	(185,'ï','\\\'ef','i umlaut'),
-	(186,'—','\\\'97','long dash'),
-	(187,'→','\\u8594 ','right arrow'),
-	(188,'←','\\u8592 ','left arrow'),
-	(189,'°','\\\'b0','degree'),
-	(190,'á','\\\'e1','accented a'),
-	(191,'†','\\\'86','cross'),
-	(192,'è','\\\'e8','accented e'),
-	(193,'ê','\\\'ea','weird e'),
-	(194,'ã','\\\'e3','a with tilde'),
-	(195,'ß','\\\'df','beta'),
-	(196,'ū','\\u363 ','u with tilde'),
-	(197,'‡','\\\'87','double dagger'),
-	(198,'©','\\\'a9','copyright'),
-	(199,'∆','\\u8710 ','delta'),
-	(200,'í','\\\'ed','accented i'),
-	(201,'’','\\\'92','apostrophe'),
-	(202,'ë','\\\'eb','e with umlaut'),
-	(203,'ñ','\\\'f1','n with tilde'),
-	(204,'±','\\\'b1','plus or minus'),
-	(205,'”','\\\'94','double quotes'),
-	(206,'×','\\\'d7','x'),
-	(207,'Å','\\\'c5','a with circle'),
-	(208,'↔','\\u8596 ','double arrow'),
-	(209,'ä','\\u228 ','a with umlaut'),
-	(210,'“','\\\'81\\\'67','double quotes'),
-	(211,'•','\\u8226 ','bullet'),
-	(212,'∗','\\u8727 ','star'),
-	(213,'{','\\{','left brace'),
-	(214,'}','\\}','right brace'),
-	(215,'¹','\\\'b9','superscript 1'),
-	(216,'²','\\\'b2','superscript 2'),
-	(217,'³','\\\'b3','superscript 3'),
-	(218,'⁴','\\u8308 ','superscript 4'),
-	(219,'⁵','\\u8309 ','superscript 5'),
-	(220,'⁶','\\u8310 ','superscript 6'),
-	(221,'⁷','\\u8311 ','superscript 7'),
-	(222,'⁸','\\u8312 ','superscript 8'),
-	(223,'⁹','\\u8313 ','superscript 9'),
-	(224,'⁰','\\u8304 ','superscript 0'),
-	(225,'₁','\\u8321 ','subscript 1'),
-	(226,'₂','\\u8322 ','subscript 2'),
-	(227,'₃','\\u8323 ','subscript 3'),
-	(228,'₄','\\u8324 ','subscript 4'),
-	(229,'₅','\\u8325 ','subscript 5'),
-	(230,'₆','\\u8326 ','subscript 6'),
-	(231,'₇','\\u8327 ','subscript 7'),
-	(232,'₈','\\u8328 ','subscript 8'),
-	(233,'₉','\\u8329 ','subscript 9'),
-	(234,'₀','\\u8320 ','subscript 0'),
-	(235,'~','\\u8764 ','tilde'),
-	(236,'⁺','\\u8314 ','superscript plus'),
-	(237,'✰','\\u10032 ','star'),
-	(238,'·','\\\'b7','dot'),
-	(239,'–','\\\'96','dash'),
-	(240,'∩','\\u8745 ','intersection'),
-	(241,'‑','\\u8209 ','dash'),
-	(242,'☆','\\u9734 ','star'),
-	(243,'ɛ','\\u603 ','backwards 3'),
-	(244,'ô','\\\'f4','o with hat'),
-	(245,'ﬁ','\\u64257 ','fi or something'),
-	(246,'ĸ','\\u312 ','k or something'),
-	(247,'ń','\\u324 ','accented n'),
-	(248,'″','\\u8243 ','quote'),
-	(249,'⁻','\\u8315 ','dash or something'),
-	(250,'‒','\\u8210 ','dash or something'),
-	(251,'ů','\\u367 ','u circle'),
-	(252,'√','\\u8730 ','checkmark'),
-	(253,'‘','\\\'91','apostrophe'),
-	(254,'ø','\\\'f8','o with slash'),
-	(255,'ú','\\\'fa','accented u'),
-	(256,'č','\\u269 ','c with caret'),
-	(257,'ć','\\u263 ','c with accent'),
-	(258,'ğ','\\u287 ','g with caret'),
-	(259,'ā','\\u257 ','a with line on top'),
-	(260,'õ','\\\'f5','o with tilde'),
-	(261,'ś','\\u347 ','accented s'),
-	(262,'î','\\\'ee','i with caret on top'),
-	(263,'ş','\\u351 ','s with squiggle'),
-	(264,'Ş','\\u350 ','capital s with squiggle'),
-	(265,'ʼ','\\u700 ','apostrophe'),
-	(266,'â','\\\'e2','a with caret'),
-	(267,'ı','\\u305 ','little i'),
-	(268,'ė','\\u279 ','e with dot'),
-	(269,'ł','\\u322 ','I with slash'),
-	(270,'ą','\\u261 ','a with squiggle'),
-	(271,'ę','\\u281 ','french e'),
-	(272,'ĭ','\\u301 ','i with half circle'),
-	(273,'ň','\\u328 ','n with caret'),
-	(274,'İ','\\u304 ','i with dot on top'),
-	(275,'ě','\\u283 ','e with caret'),
-	(276,'ǧ','\\u487 ','g with caret'),
-	(277,'ő','\\u337 ','o with two accents'),
-	(278,'û','\\\'fb','u with caret'),
-	(279,'ý','\\\'fd','y with accent'),
-	(280,'ź','\\u378 ','z with accent'),
-	(281,'ż','\\u380 ','z with dot'),
-	(282,'ű','\\u369 ','u with two accents'),
-	(283,'ŏ','\\u335 ','o with half circle'),
-	(284,'ī','\\u299 ','i with line on top'),
-	(285,'ӧ','\\u1255 ','o with umlaut'),
-	(286,'Đ','\\u272 ','d with slash'),
-	(287,'ř','\\u345 ','r with caret'),
-	(288,'ˇ','\\u711 ','caret'),
-	(289,'ă','\\u259 ','a with caret'),
-	(290,'ŕ','\\u341 ','r with accent'),
-	(291,'ĕ','\\u277 ','e with caret'),
-	(292,'ό','\\u972 ','o with accent'),
-	(293,'ũ','\\u361 ','u with tilde'),
-	(294,'׳','\\\'d7','apostrophe');
-UNLOCK TABLES;
-
-LOCK TABLES `admin_settings` WRITE;
-TRUNCATE `admin_settings`;
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('displayMessages', '[{"labelUserKey":"messages","helpTextSettingsView":"Controls the displying of the success or error messages","isVisible":true}]', 'Display Messages');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('EmailNotifications', '[{"labelUserKey":"emailNotifications","labelSettingsView":"Enable email notifications","helpTextSettingsView":"Check to enable for all users. Once enabled, users with the \\"curator_self role\\" and a registered email will have the option to \\"Manage notifications.\\"","isVisible":true},{"labelUserKey":"emailSender","labelSettingsView":"Email sender","helpTextSettingsView":"Specify the \\"from\\" email address for the notifications. You can use either of these formats \\"publications@med.cornell.edu\\" or \\"WCM Publications <publications@med.cornell.edu>\\"","labelUserView":"publications@med.cornell.edu"},{"labelUserKey":"emailSalutation","labelSettingsView":"Email body: salutation","labelUserView":"","helpTextSettingsView":"This text is the greeting portion of the email notification."},{"labelUserKey":"acceptedSubjectHeader","labelSettingsView":"Email body: \\"Accepted\\" section prefix","helpTextSettingsView":"This text precedes the list of any publications that have been accepted on behalf of a given person.","labelUserView":"The following publications have been accepted on your behalf"},{"labelUserKey":"suggestedSubjectHeader","labelSettingsView":"Email body: \\"Suggested\\" section prefix","helpTextSettingsView":"This text precedes the list of any publications that have been suggested for a given person.","labelUserView":"The following publications are pending for you"},{"labelUserKey":"acceptedEmailNotificationsLimit","labelSettingsView":"Email body: max accepted articles","helpTextSettingsView":"Select the maximum number of accepted publications to display in an email. We recommend a limit of 10. Note that this section excludes publications that have been accepted by the user themselves.","maxLimit":"5"},{"labelUserKey":"suggestedEmailNotificationsLimit","labelSettingsView":"Email body: max suggested articles","helpTextSettingsView":"Select the maximum number of suggested publications to display in an email. 
We recommend a limit of 10.","maxLimit":"10"},{"labelUserKey":"emailSignature","labelUserView":"Sincerely,\\nSamuel J. Wood Library\\nWeill Cornell Medicine\\n","labelSettingsView":"Email signature","helpTextSettingsView":"Define the signature that will appear at the end of the email."},{"labelUserKey":"testemailfunctionality","labelSettingsView":"Test emailing functionality","helpTextSettingsView":"Here you can test the email user functionality by inputting a person identifier, an email address recipient, and then clicking on \\"Send test email\\". If the Email Override field is blank, users will be contacted at their email of record as stored in the admin_users table.If the Email Override checkbox is selected, all notification emails from any regularly scheduled job will be sent to the email address specified in this field.","personIdentifier":"acs2001","emailOverride":"reciter2024@med.cornell.edu","useEmailForScheduledJobs":true,"submitButton":""}]', 'Email Notifications');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('findPeople', '[{"labelUserKey":"personIdentifier","labelUserView":"CWID","labelSettingsView":"Label for person identifier","helpTextSettingsView":"e.g.,NetID, CWID. Used throughout the application. "},{"labelUserKey":"organization","labelUserView":"Organization(s)","labelSettingsView":"Label for organizational unit","helpTextSettingsView":"e.g., Pediatrics"},{"labelUserKey":"institution","labelUserView":"Institution(s)","labelSettingsView":"Label for institution","helpTextSettingsView":"e.g., Cornell University"},{"labelUserKey":"personType","labelUserView":"Person Type(s)","labelSettingsView":"Label for person type","helpTextSettingsView":"e.g., academic-faculty,student-phd"}]', 'Find People');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('headshot', '[{"labelUserKey":"headshot","labelUserView":"Headshot","labelSettingsView":"Headshot","helpTextSettingsView":"Include the full URL for a third party headshot API where a token a personIdentifier is enclosed by braces","syntax":"https://directory.weill.cornell.edu/api/v1/person/profile/{personIdentifier}.png?returnGenericOn404=false"}]', 'Headshot');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('reportingArticleCSV', '[{"labelUserKey":"reportingArticleCSVLimit","labelSettingsView":"Maximum records for export to article CSV","helpTextSettingsView":"","maxLimit":"500000"},{"labelUserKey":"pmid","labelUserView":"PMID","labelSettingsView":"PMID","helpTextSettingsView":"","displayRank":"21","isVisible":true},{"labelUserKey":"Article title","labelUserView":"Article title","labelSettingsView":"Article title","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"Article year","labelUserView":"Article year","labelSettingsView":"Article year","helpTextSettingsView":"","displayRank":"3","isVisible":true},{"labelUserKey":"pmcid","labelUserView":"PMCID","labelSettingsView":"PMCID","helpTextSettingsView":"","displayRank":4,"isVisible":true},{"labelUserKey":"Publication date display","labelUserView":"Publication date display","labelSettingsView":"Publication date display","helpTextSettingsView":"","displayRank":5,"isVisible":true},{"labelUserKey":"Date standardized","labelUserView":"Publication date standardized","labelSettingsView":"Publication date standardized","helpTextSettingsView":"","displayRank":6,"isVisible":true},{"labelUserKey":"Date added","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":7,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal Title Verbose","labelSettingsView":"Journal Title 
Verbose","helpTextSettingsView":"","displayRank":8,"isVisible":true},{"labelUserKey":"doi","labelUserView":"DOI","labelSettingsView":"DOI","helpTextSettingsView":"","displayRank":9,"isVisible":true},{"labelUserKey":"Issue","labelUserView":"Issue","labelSettingsView":"Issue","helpTextSettingsView":"","displayRank":10,"isVisible":true},{"labelUserKey":"Pages","labelUserView":"Pages","labelSettingsView":"Pages","helpTextSettingsView":"","displayRank":11,"isVisible":true},{"labelUserKey":"Volume","labelUserView":"Volume","labelSettingsView":"Volume","helpTextSettingsView":"","displayRank":12,"isVisible":true},{"labelUserKey":"Scimago Journal Rank","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","displayRank":13,"isVisible":true},{"labelUserKey":"Mendeley readers","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","displayRank":14,"isVisible":true},{"labelUserKey":"NIH Relative Citation Ratio","labelUserView":"NIH Relative Citation Ratio","labelSettingsView":"NIH Relative Citation Ratio","helpTextSettingsView":"","displayRank":15,"isVisible":true},{"labelUserKey":"NIH Percentile Rank","labelUserView":"NIH Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","displayRank":16,"isVisible":true},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextSettingsView":"","displayRank":17,"isVisible":false},{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","helpTextSettingsView":"","displayRank":18,"isVisible":true},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count 
(Scopus)","helpTextSettingsView":"","displayRank":19,"isVisible":false},{"labelUserKey":"Authors","labelUserView":"Author(s)","labelSettingsView":"Author(s)","helpTextSettingsView":"","displayRank":"1","isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"An additional journal-level metric","displayRank":"20","isVisible":true},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author Position","helpTextSettingsView":"Author Position","displayRank":"21","isVisible":true}]', 'Reporting Article CSV');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('reportingArticleRTF', '[{"labelUserKey":"reportingArticleRTFLimit","labelSettingsView":"Maximum records for export to article RTF","helpTextSettingsView":"Maximum number of article records a user can export to RTF. In testing, we have found the export fails after 40,000 records. ","maxLimit":"1000","isValidate":false,"errorMessage":"Limit cannot exceed 40000"}]', 'Reporting Article RTF');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('reportingAuthorshipCSV', '[{"labelUserKey":"reportingAuthorshipCSVLimit","labelSettingsView":"Maximum records for export to article CSV","helpTextSettingsView":"","maxLimit":"500000"},{"labelUserKey":"personIdentifier","labelUserView":"PersonIdentifier","labelSettingsView":"CWID","helpTextSettingsView":"","displayRank":"1","isVisible":false},{"labelUserKey":"Last Name","labelUserView":"Last Name","labelSettingsView":"Last Name","helpTextSettingsView":"","displayRank":"2","isVisible":false},{"labelUserKey":"First Name","labelUserView":"First Name","labelSettingsView":"First Name","helpTextSettingsView":"","displayRank":"3","isVisible":false},{"labelUserKey":"Organization","labelUserView":"Organization","labelSettingsView":"Organization","helpTextSettingsView":"","displayRank":"4","isVisible":true},{"labelUserKey":"Institution","labelUserView":"Institution","labelSettingsView":"Institution","helpTextSettingsView":"","displayRank":"5","isVisible":true},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author Position","helpTextSettingsView":"Author Position","displayRank":"6","isVisible":true},{"labelUserKey":"pmid","labelUserView":"PMID","labelSettingsView":"PMID","helpTextSettingsView":"","displayRank":7,"isVisible":true},{"labelUserKey":"Article title","labelUserView":"Article title","labelSettingsView":"Article title","helpTextSettingsView":"","displayRank":8,"isVisible":true},{"labelUserKey":"Article year","labelUserView":"Article year","labelSettingsView":"Article year","helpTextSettingsView":"","displayRank":9,"isVisible":true},{"labelUserKey":"pmcid","labelUserView":"PMCID","labelSettingsView":"PMCID","helpTextSettingsView":"","displayRank":10,"isVisible":true},{"labelUserKey":"Publication date display","labelUserView":"Publication date display","labelSettingsView":"Publication date display","helpTextSettingsView":"","displayRank":11,"isVisible":true},{"labelUserKey":"Date standardized","labelUserView":"Date 
standardized","labelSettingsView":"Publication date standardized","helpTextSettingsView":"","displayRank":12,"isVisible":true},{"labelUserKey":"Date added","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":13,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal title","labelSettingsView":"Journal title","helpTextSettingsView":"","displayRank":14,"isVisible":true},{"labelUserKey":"doi","labelUserView":"DOI","labelSettingsView":"DOI","helpTextSettingsView":"","displayRank":15,"isVisible":true},{"labelUserKey":"Issue","labelUserView":"Issue","labelSettingsView":"Issue","helpTextSettingsView":"","displayRank":16,"isVisible":true},{"labelUserKey":"Pages","labelUserView":"Pages","labelSettingsView":"Pages","helpTextSettingsView":"","displayRank":17,"isVisible":true},{"labelUserKey":"Volume","labelUserView":"Volume","labelSettingsView":"Volume","helpTextSettingsView":"","displayRank":18,"isVisible":true},{"labelUserKey":"Scimago Journal Rank","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","displayRank":19,"isVisible":true},{"labelUserKey":"Mendeley readers","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","displayRank":20,"isVisible":true},{"labelUserKey":"NIH Relative Citation Ratio","labelUserView":"NIH Relative Citation Ratio","labelSettingsView":"NIH Relative Citation Ratio","helpTextSettingsView":"","displayRank":21,"isVisible":true},{"labelUserKey":"NIH Percentile Rank","labelUserView":"NIH Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","displayRank":22,"isVisible":true},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextSettingsView":"","displayRank":23,"isVisible":false},{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation 
count (NIH) ","helpTextSettingsView":"","displayRank":24,"isVisible":true},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus) ","labelSettingsView":"Citation count (Scopus) ","helpTextSettingsView":"","displayRank":25,"isVisible":false},{"labelUserKey":"Person types","labelUserView":"Person type(s)","labelSettingsView":"Person type(s)","helpTextSettingsView":"","displayRank":26,"isVisible":true},{"labelUserKey":"Authors","labelUserView":"Author(s)","labelSettingsView":"Author(s)","helpTextSettingsView":"","displayRank":27,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal Title Verbose","labelSettingsView":"Journal Title Verbose","helpTextSettingsView":"","displayRank":28,"isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"An additional journal-level metric","displayRank":"20","isVisible":true}]', 'Reporting Authorship CSV');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('reportingFilters', '[{"labelUserKey":"Author","labelUserView":"Author","labelSettingsView":"Author","helpTextSettingsView":""},{"labelUserKey":"Organization","labelUserView":"Organization","labelSettingsView":"Organization","helpTextSettingsView":""},{"labelUserKey":"Institution","labelUserView":"Institution","labelSettingsView":"Institution","helpTextSettingsView":""},{"labelUserKey":"Person Type","labelUserView":"Person Type","labelSettingsView":"Person Type","helpTextSettingsView":""},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author Position","helpTextSettingsView":""},{"labelUserKey":"Date","labelUserView":"Date","labelSettingsView":"Date","helpTextSettingsView":""},{"labelUserKey":"Type","labelUserView":"Type","labelSettingsView":"Type","helpTextSettingsView":""},{"labelUserKey":"Journal","labelUserView":"Journal","labelSettingsView":"Journal","helpTextSettingsView":""},{"labelUserKey":"Journal Rank","labelUserView":"Journal Rank","labelSettingsView":"Journal Rank","helpTextSettingsView":""}]', 'Reporting Filters');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('reportingWebDisplay', '[{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","helpTextUserView":"This is the test number of citations an article has received from CrossRef,  MEDLINE,  PubMed Central,  and Entrez. NIH citation counts generally correlate closely with the counts in Scopus and Web of Knowledge. ","helpTextSettingsView":"","displayRank":"3","isVisible":false},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextUserView":"","helpTextSettingsView":"","displayRank":"1","isVisible":true},{"labelUserKey":"Percentile Rank","labelUserView":"Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextUserView":"NIH percentile is the value of RCR provided as a percentile in which 100 is the highest and 0 is the lowest. For example, if an article has an NIH percentile of 63.2, it has received more citations than 631 articles when measured against a field and time-weighted benchmark of 1,000 NIH-funded research articles from the same year. A percentile is not computed for an article published in the past two years.","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"Relative Citation Ratio (NIH)","labelUserView":"Relative Citation Ratio (NIH)","labelSettingsView":"Relative Citation Ratio (NIH)","helpTextSettingsView":"","helpTextUserView":"Relative Citation Ratio (RCR) is the ratio between the number of times an article was cited in comparison to publications of the same date and field (as inferred by co-citation networks). A value of 1.0 is the median. Higher is better. 
The benchmark consists of research articles that are the product of R01 grants, the NIH''''s most prestigious and competitive funding mechanism.","displayRank":4,"isVisible":true},{"labelUserKey":"Journal Rank","labelUserView":"Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextUserView":"SCImago Journal Rank is a measure of the relative number of inbound citations articles in a given journal receive compared to outbound citations. It is closely correlated with Journal Impact Factor.","helpTextSettingsView":"Journal Rank Help text","displayRank":"5","isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Impact","labelSettingsView":"Journal Metric","helpTextUserView":"","helpTextSettingsView":"","displayRank":6,"isVisible":true},{"labelUserKey":"readersMendeley","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextUserView":"","helpTextSettingsView":"","displayRank":7,"isVisible":false},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextUserView":"","helpTextSettingsView":"","displayRank":8,"isVisible":false}]', 'Reporting Web Display');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('reportingWebViewSort', '[{"labelUserKey":"datePublicationAddedToEntrez","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"citationCountNIH","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","displayRank":"1","helpTextSettingsView":"","isVisible":true},{"labelUserKey":"citationCountScopus","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextSettingsView":"","displayRank":"3","isVisible":true},{"labelUserKey":"journalImpactScore1","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","isVisible":true,"displayRank":"4"},{"labelUserKey":"journalImpactScore2","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"","displayRank":"5","isVisible":true},{"labelUserKey":"readersMendeley","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","isVisible":false,"displayRank":"6"},{"labelUserKey":"percentileNIH","labelUserView":"NIH Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","isVisible":true,"displayRank":"7"},{"labelUserKey":"relativeCitationRatioNIH","labelUserView":"Relative Citation Ratio (NIH)","labelSettingsView":"Relative Citation Ratio (NIH)","helpTextSettingsView":"","isVisible":true,"displayRank":"8"},{"labelUserKey":"trendingPubsScore","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextSettingsView":"","isVisible":false,"displayRank":"9"}]', 'Reporting Web View Sort');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('userRoles', '[{"labelUserKey":"Roles````","labelSettingsView":"Select the role","inputType":"check","isRoleGroup":false,"roles":[{"roleId":3,"roleName":"Repoter_All","roleLabel":"Automatically grant all successfully authenticated users the reporter_all role.","isChecked":true},{"roleId":2,"roleName":"Curator_All","roleLabel":"Automatically grant all successfully authenticated users the curator_all role","isChecked":false}]}]', 'User Default Role(s)');
-INSERT INTO `admin_settings`
-(`viewName`, `viewAttributes`, `viewLabel`)
-VALUES('viewProfile', '[{"labelUserKey":"h-index","labelUserView":"h-index","labelSettingsView":"h-index (NIH)","helpTextUserView":"h-index is the number of an author''s articles in PubMed that have been cited, as defined by NIH''s iCite service, at least that many times. ","helpTextSettingsView":""},{"labelUserKey":"h5-index","labelUserView":"h5-index","labelSettingsView":"h5-index (NIH)","helpTextUserView":"h5-index is the number of an author''s articles in PubMed that have been cited,  as defined by NIH''s iCite service, at least that many times within the past 5 years. ","helpTextSettingsView":""},{"labelUserKey":"hindexScopus","labelUserView":"h-index (Scopus)","labelSettingsView":"h-index (Scopus)","helpTextUserView":"h-index is the number of an author''s articles in Scopus that have been cited, as defined by Scopus, at least that many times.","helpTextSettingsView":""},{"labelUserKey":"h5IndexScopus","labelUserView":"h5-index (Scopus)","labelSettingsView":"h5-index (Scopus)","helpTextUserView":"h-index is the number of an author''s articles in Scopus published in the last five years that have been cited, as defined by Scopus, at least that many times.","helpTextSettingsView":""}]', 'View Profile');
-UNLOCK TABLES;
diff --git a/setup/table_admin_roles.sql b/setup/table_admin_roles.sql
new file mode 100644
index 0000000..50dcf8b
--- /dev/null
+++ b/setup/table_admin_roles.sql
@@ -0,0 +1,9 @@
+LOCK TABLES `admin_roles` WRITE; -- Seed data for admin_roles: take a write lock on the table for the insert below
+INSERT INTO `admin_roles` (`roleID`, `roleLabel`) VALUES -- one tuple per role, in (roleID, roleLabel) order
+  (1,'Superuser'),
+  (2,'Curator_All'),
+  (3,'Reporter_All'),
+  (4,'Curator_Self'),
+  (5,'Curator_Department'),
+  (6,'Curator_Department_Delegate'); -- last of the six seeded roles (roleID 1 through 6)
+UNLOCK TABLES; -- release the write lock taken at the top of this script
\ No newline at end of file
diff --git a/update/temp/.gitkeep b/update/temp/.gitkeep
deleted file mode 100644
index e69de29..0000000

From 413043cd0c2c918bc85bfaba21d4ae66691188bd Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Thu, 26 Mar 2026 16:27:33 -0400
Subject: [PATCH 03/19] fix(percentile): replace broken
 percentile/rank/denominator logic in standalone SP

STEP 4: Replace person.title with a person_person_type LEFT JOIN chain
(COALESCE) for facultyRank derivation. Replace the article counts, which
previously counted all articles, with filtered counts (publicationTypeNIH
= 'Research Article' and percentileNIH IS NOT NULL only).

STEP 5: Replace all 8 combined UPDATE statements (which used an incorrect
threshold-percentage formula) with 24 separate UPDATE statements
(percentile, denominator, and rank for each of the 8 metrics) matching
the canonical createEventsProceduresReciterDb.sql v2:
- Percentile: AVG of top N articles ranked by percentileNIH DESC
- Denominator: Count of peers at same facultyRank meeting thresholds
- Rank: RANK() OVER (PARTITION BY facultyRank ORDER BY percentile DESC)

All 8 metrics (top5/10 x All/First/Senior/FirstSenior) now match the
canonical logic. Steps 1-3, 6, 7 verified identical and unchanged.

Ref: createEventsProceduresReciterDb.sql lines 3720-4252
---
 setup/populateAnalysisSummaryTables_v2.sql | 600 ++++++++++++++++-----
 1 file changed, 460 insertions(+), 140 deletions(-)

diff --git a/setup/populateAnalysisSummaryTables_v2.sql b/setup/populateAnalysisSummaryTables_v2.sql
index de2bfbd..b8807e5 100644
--- a/setup/populateAnalysisSummaryTables_v2.sql
+++ b/setup/populateAnalysisSummaryTables_v2.sql
@@ -564,53 +564,106 @@ proc_main: BEGIN
 
     CALL log_progress(v_job_id, v_step, 'Complete', 'DONE', NULL, CONCAT(TIMESTAMPDIFF(SECOND, v_start_time, NOW()), 's elapsed'));
 
+
     -- ========================================================================
     -- STEP 4: Populate analysis_summary_person_new
     -- ========================================================================
     SET v_step = '4. Populate analysis_summary_person';
     CALL log_progress(v_job_id, v_step, 'Inserting person records', 'RUNNING', NULL, NULL);
 
-    INSERT INTO analysis_summary_person_new (personIdentifier, nameFirst, nameMiddle, nameLast, facultyRank, department)
-    SELECT DISTINCT
-        p.personIdentifier,
-        p.firstName,
-        p.middleName,
-        p.lastName,
-        p.title,
-        p.primaryOrganizationalUnit
-    FROM person p
-    JOIN analysis_summary_person_scope s ON s.personIdentifier = p.personIdentifier;
+    -- Populate using person_person_type to derive facultyRank
+    INSERT INTO analysis_summary_person_new (personIdentifier, nameFirst, nameMiddle, nameLast, department, facultyRank)
+    SELECT * FROM (
+        SELECT DISTINCT
+            p.personIdentifier,
+            p.firstName AS nameFirst,
+            p.middleName AS nameMiddle,
+            p.lastName AS nameLast,
+            p.primaryOrganizationalUnit AS department,
+            COALESCE(a.facultyRank, b.facultyRank, c.facultyRank, d.facultyRank) AS facultyRank
+        FROM person p
+
+        LEFT JOIN (
+            SELECT personIdentifier, 'Full Professor' AS facultyRank
+            FROM person_person_type
+            WHERE personType = 'academic-faculty-fullprofessor'
+        ) a ON a.personIdentifier = p.personIdentifier
+
+        LEFT JOIN (
+            SELECT personIdentifier, 'Associate Professor' AS facultyRank
+            FROM person_person_type
+            WHERE personType = 'academic-faculty-associate'
+        ) b ON b.personIdentifier = p.personIdentifier
+
+        LEFT JOIN (
+            SELECT personIdentifier, 'Assistant Professor' AS facultyRank
+            FROM person_person_type
+            WHERE personType = 'academic-faculty-assistant'
+        ) c ON c.personIdentifier = p.personIdentifier
+
+        LEFT JOIN (
+            SELECT personIdentifier, 'Instructor or Lecturer' AS facultyRank
+            FROM person_person_type
+            WHERE personType IN ('academic-faculty-instructor', 'academic-faculty-lecturer')
+        ) d ON d.personIdentifier = p.personIdentifier
+
+        INNER JOIN analysis_summary_person_scope e ON e.personIdentifier = p.personIdentifier
+    ) x
+    WHERE facultyRank IS NOT NULL;
 
     SET v_rows = ROW_COUNT();
     CALL log_progress(v_job_id, v_step, 'Inserted person records', 'INFO', v_rows, NULL);
 
-    -- Update article counts
+    -- ========================================================================
+    -- STEP 4b: Compute article counts
+    -- Counts are for articles with publicationTypeNIH = 'Research Article'
+    -- and percentileNIH is not null
+    -- ========================================================================
     CALL log_progress(v_job_id, v_step, 'Updating article counts', 'RUNNING', NULL, NULL);
+
+    -- countAll: Count of research articles with NIH percentile
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt
-        FROM analysis_summary_author_new
-        GROUP BY personIdentifier
-    ) c ON c.personIdentifier = p.personIdentifier
-    SET p.countAll = c.cnt;
+        SELECT s.personIdentifier, COUNT(a1.pmid) AS count
+        FROM analysis_summary_person_new s
+        JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+        Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+        WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL
+        GROUP BY s.personIdentifier
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.countAll = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated countAll', 'INFO', v_rows, NULL);
 
+    -- countFirst: Count of first-authored research articles with NIH percentile
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt
-        FROM analysis_summary_author_new
-        WHERE authorPosition = 'first'
-        GROUP BY personIdentifier
-    ) c ON c.personIdentifier = p.personIdentifier
-    SET p.countFirst = c.cnt;
+        SELECT s.personIdentifier, COUNT(a1.pmid) AS count
+        FROM analysis_summary_person_new s
+        Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+        Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+        WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL
+          AND a.authorPosition = 'first'
+        GROUP BY s.personIdentifier
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.countFirst = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated countFirst', 'INFO', v_rows, NULL);
 
+    -- countSenior: Count of senior/last-authored research articles with NIH percentile
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt
-        FROM analysis_summary_author_new
-        WHERE authorPosition = 'last'
-        GROUP BY personIdentifier
-    ) c ON c.personIdentifier = p.personIdentifier
-    SET p.countSenior = c.cnt;
+        SELECT s.personIdentifier, COUNT(a1.pmid) AS count
+        FROM analysis_summary_person_new s
+        Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+        Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+        WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL
+          AND a.authorPosition = 'last'
+        GROUP BY s.personIdentifier
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.countSenior = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated countSenior', 'INFO', v_rows, NULL);
 
     IF v_error_occurred THEN
         CALL log_progress(v_job_id, v_step, 'Failed', 'ERROR', NULL, v_error_message);
@@ -621,152 +674,419 @@ proc_main: BEGIN
     CALL log_progress(v_job_id, v_step, 'Complete', 'DONE', NULL, CONCAT(TIMESTAMPDIFF(SECOND, v_start_time, NOW()), 's elapsed'));
 
     -- ========================================================================
-    -- STEP 5: Compute percentile rankings (with rank and denominator)
+    -- STEP 5: Compute percentile rankings (peer-based)
+    -- Percentile = average of top N articles by percentileNIH
+    -- Denominator = count of people with same facultyRank who have the metric
+    -- Rank = rank within facultyRank by percentile value
     -- ========================================================================
     SET v_step = '5. Compute percentile rankings';
-    CALL log_progress(v_job_id, v_step, 'Computing percentiles (8 metrics with rank/denominator)', 'RUNNING', NULL, NULL);
+    CALL log_progress(v_job_id, v_step, 'Computing percentiles (peer-based avg of top N)', 'RUNNING', NULL, NULL);
+
+    -- ========================================================================
+    -- 5a. TOP 5 PERCENTILE - ALL POSITIONS
+    -- ========================================================================
 
-    -- Top 5 percentile, first/last authored
+    -- top5PercentileAll: Average of top 5 percentiles (requires countAll > 4)
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-          AND authorPosition IN ('first', 'last')
-        GROUP BY a.personIdentifier
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL AND s.countAll > 4
+        ) y
+        WHERE article_rank < 6
+        GROUP BY personIdentifier
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top5PercentileFirstSenior = x.pct,
-        p.top5RankFirstSenior = x.rank_count,
-        p.top5DenominatorFirstSenior = x.denominator;
+    SET p.top5PercentileAll = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5PercentileAll', 'INFO', v_rows, NULL);
 
-    -- Top 10 percentile, first/last authored
+    -- top5DenominatorAll: Count of people in same facultyRank with valid top5PercentileAll
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-          AND authorPosition IN ('first', 'last')
-        GROUP BY a.personIdentifier
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top5PercentileAll IS NOT NULL AND countAll > 4
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top5DenominatorAll = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorAll', 'INFO', v_rows, NULL);
+
+    -- top5RankAll: Rank within facultyRank by top5PercentileAll
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileAll DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE countAll > 4
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top10PercentileFirstSenior = x.pct,
-        p.top10RankFirstSenior = x.rank_count,
-        p.top10DenominatorFirstSenior = x.denominator;
+    SET p.top5RankAll = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5RankAll', 'INFO', v_rows, NULL);
+
+    -- ========================================================================
+    -- 5b. TOP 10 PERCENTILE - ALL POSITIONS
+    -- ========================================================================
 
-    -- Top 5 percentile, first authored only
+    -- top10PercentileAll: Average of top 10 percentiles (requires countAll > 9)
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-          AND authorPosition = 'first'
-        GROUP BY a.personIdentifier
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL AND s.countAll > 9
+        ) y
+        WHERE article_rank < 11
+        GROUP BY personIdentifier
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top5PercentileFirst = x.pct,
-        p.top5RankFirst = x.rank_count,
-        p.top5DenominatorFirst = x.denominator;
+    SET p.top10PercentileAll = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10PercentileAll', 'INFO', v_rows, NULL);
 
-    -- Top 10 percentile, first authored only
+    -- top10DenominatorAll: Count of people in same facultyRank with valid top10PercentileAll
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-          AND authorPosition = 'first'
-        GROUP BY a.personIdentifier
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top10PercentileAll IS NOT NULL AND countAll > 9
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top10DenominatorAll = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorAll', 'INFO', v_rows, NULL);
+
+    -- top10RankAll: Rank within facultyRank by top10PercentileAll
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileAll DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE countAll > 9
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top10PercentileFirst = x.pct,
-        p.top10RankFirst = x.rank_count,
-        p.top10DenominatorFirst = x.denominator;
+    SET p.top10RankAll = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10RankAll', 'INFO', v_rows, NULL);
+
+    -- ========================================================================
+    -- 5c. TOP 5 PERCENTILE - FIRST AUTHOR ONLY
+    -- ========================================================================
 
-    -- Top 5 percentile, last authored only
+    -- top5PercentileFirst: Average of top 5 percentiles for first-authored (requires countFirst > 4)
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-          AND authorPosition = 'last'
-        GROUP BY a.personIdentifier
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL AND s.countFirst > 4
+              AND a.authorPosition = 'first'
+        ) y
+        WHERE article_rank < 6
+        GROUP BY personIdentifier
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top5PercentileSenior = x.pct,
-        p.top5RankSenior = x.rank_count,
-        p.top5DenominatorSenior = x.denominator;
+    SET p.top5PercentileFirst = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5PercentileFirst', 'INFO', v_rows, NULL);
 
-    -- Top 10 percentile, last authored only
+    -- top5DenominatorFirst
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-          AND authorPosition = 'last'
-        GROUP BY a.personIdentifier
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top5PercentileFirst IS NOT NULL AND countFirst > 4
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top5DenominatorFirst = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorFirst', 'INFO', v_rows, NULL);
+
+    -- top5RankFirst
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileFirst DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE countFirst > 4
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top10PercentileSenior = x.pct,
-        p.top10RankSenior = x.rank_count,
-        p.top10DenominatorSenior = x.denominator;
+    SET p.top5RankFirst = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5RankFirst', 'INFO', v_rows, NULL);
 
-    -- Top 5 percentile, all positions
+    -- ========================================================================
+    -- 5d. TOP 10 PERCENTILE - FIRST AUTHOR ONLY
+    -- ========================================================================
+
+    -- top10PercentileFirst: Average of top 10 percentiles for first-authored (requires countFirst > 9)
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-        GROUP BY a.personIdentifier
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL AND s.countFirst > 9
+              AND a.authorPosition = 'first'
+        ) y
+        WHERE article_rank < 11
+        GROUP BY personIdentifier
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top5PercentileAll = x.pct,
-        p.top5RankAll = x.rank_count,
-        p.top5DenominatorAll = x.denominator;
+    SET p.top10PercentileFirst = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10PercentileFirst', 'INFO', v_rows, NULL);
 
-    -- Top 10 percentile, all positions
+    -- top10DenominatorFirst
     UPDATE analysis_summary_person_new p
     JOIN (
-        SELECT
-            a.personIdentifier,
-            ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct,
-            SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count,
-            COUNT(*) AS denominator
-        FROM analysis_summary_author_new a
-        JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid
-        WHERE percentileNIH IS NOT NULL
-        GROUP BY a.personIdentifier
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top10PercentileFirst IS NOT NULL AND countFirst > 9
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top10DenominatorFirst = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorFirst', 'INFO', v_rows, NULL);
+
+    -- top10RankFirst
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileFirst DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE countFirst > 9
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top10RankFirst = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10RankFirst', 'INFO', v_rows, NULL);
+
+    -- ========================================================================
+    -- 5e. TOP 5 PERCENTILE - SENIOR/LAST AUTHOR ONLY
+    -- ========================================================================
+
+    -- top5PercentileSenior: Average of top 5 percentiles for last-authored (requires countSenior > 4)
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL AND s.countSenior > 4
+              AND a.authorPosition = 'last'
+        ) y
+        WHERE article_rank < 6
+        GROUP BY personIdentifier
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top5PercentileSenior = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5PercentileSenior', 'INFO', v_rows, NULL);
+
+    -- top5DenominatorSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top5PercentileSenior IS NOT NULL AND countSenior > 4
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top5DenominatorSenior = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorSenior', 'INFO', v_rows, NULL);
+
+    -- top5RankSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileSenior DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE countSenior > 4
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top5RankSenior = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5RankSenior', 'INFO', v_rows, NULL);
+
+    -- ========================================================================
+    -- 5f. TOP 10 PERCENTILE - SENIOR/LAST AUTHOR ONLY
+    -- ========================================================================
+
+    -- top10PercentileSenior: Average of the top 10 percentiles for last-authored articles (requires countSenior > 9); RANK() keeps ties, so more than 10 rows can be averaged
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL AND s.countSenior > 9
+              AND a.authorPosition = 'last'
+        ) y
+        WHERE article_rank < 11
+        GROUP BY personIdentifier
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top10PercentileSenior = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10PercentileSenior', 'INFO', v_rows, NULL);
+
+    -- top10DenominatorSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top10PercentileSenior IS NOT NULL AND countSenior > 9
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top10DenominatorSenior = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorSenior', 'INFO', v_rows, NULL);
+
+    -- top10RankSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileSenior DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE countSenior > 9
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top10RankSenior = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10RankSenior', 'INFO', v_rows, NULL);
+
+    -- ========================================================================
+    -- 5g. TOP 5 PERCENTILE - FIRST OR SENIOR (combined)
+    -- Note: countFirstSenior is computed inline since column doesn't exist
+    -- ========================================================================
+
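+    -- top5PercentileFirstSenior: Average of the top 5 percentiles for first- or last-authored articles (no publicationTypeNIH filter on the averaged rows)
+    -- Eligibility (subquery below): more than 4 first/last-authored 'Research Article' rows with a non-NULL percentileNIH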
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL
+              AND a.authorPosition IN ('first', 'last')
+              AND s.personIdentifier IN (
+                  -- Only include people with > 4 first/last-authored 'Research Article' rows that have a percentileNIH
+                  SELECT s2.personIdentifier
+                  FROM analysis_summary_person_new s2
+                  Join analysis_summary_author_new a2 ON a2.personIdentifier = s2.personIdentifier
+                  Join analysis_summary_article_new a12 ON a12.pmid = a2.pmid
+                  WHERE a12.publicationTypeNIH = 'Research Article' AND a12.percentileNIH IS NOT NULL
+                    AND a2.authorPosition IN ('first', 'last')
+                  GROUP BY s2.personIdentifier
+                  HAVING COUNT(*) > 4
+              )
+        ) y
+        WHERE article_rank < 6
+        GROUP BY personIdentifier
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top5PercentileFirstSenior = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5PercentileFirstSenior', 'INFO', v_rows, NULL);
+
+    -- top5DenominatorFirstSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top5PercentileFirstSenior IS NOT NULL
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top5DenominatorFirstSenior = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorFirstSenior', 'INFO', v_rows, NULL);
+
+    -- top5RankFirstSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileFirstSenior DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE top5PercentileFirstSenior IS NOT NULL
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top5RankFirstSenior = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top5RankFirstSenior', 'INFO', v_rows, NULL);
+
+    -- ========================================================================
+    -- 5h. TOP 10 PERCENTILE - FIRST OR SENIOR (combined)
+    -- ========================================================================
+
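+    -- top10PercentileFirstSenior: Average of the top 10 percentiles for first- or last-authored articles (requires > 9 first/last-authored 'Research Article' rows with a percentileNIH)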
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH
+        FROM (
+            SELECT s.personIdentifier, a1.pmid, a1.percentileNIH,
+                   RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank
+            FROM analysis_summary_person_new s
+            Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier
+            Join analysis_summary_article_new a1 ON a1.pmid = a.pmid
+            WHERE a1.percentileNIH IS NOT NULL
+              AND a.authorPosition IN ('first', 'last')
+              AND s.personIdentifier IN (
+                  -- Only include people with > 9 first/last-authored 'Research Article' rows that have a percentileNIH
+                  SELECT s2.personIdentifier
+                  FROM analysis_summary_person_new s2
+                  Join analysis_summary_author_new a2 ON a2.personIdentifier = s2.personIdentifier
+                  Join analysis_summary_article_new a12 ON a12.pmid = a2.pmid
+                  WHERE a12.publicationTypeNIH = 'Research Article' AND a12.percentileNIH IS NOT NULL
+                    AND a2.authorPosition IN ('first', 'last')
+                  GROUP BY s2.personIdentifier
+                  HAVING COUNT(*) > 9
+              )
+        ) y
+        WHERE article_rank < 11
+        GROUP BY personIdentifier
+    ) x ON x.personIdentifier = p.personIdentifier
+    SET p.top10PercentileFirstSenior = x.percentileNIH;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10PercentileFirstSenior', 'INFO', v_rows, NULL);
+
+    -- top10DenominatorFirstSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT COUNT(*) AS count, facultyRank
+        FROM analysis_summary_person_new
+        WHERE top10PercentileFirstSenior IS NOT NULL
+        GROUP BY facultyRank
+    ) x ON x.facultyRank = p.facultyRank
+    SET p.top10DenominatorFirstSenior = x.count;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorFirstSenior', 'INFO', v_rows, NULL);
+
+    -- top10RankFirstSenior
+    UPDATE analysis_summary_person_new p
+    JOIN (
+        SELECT personIdentifier,
+               RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileFirstSenior DESC) AS personRank
+        FROM analysis_summary_person_new
+        WHERE top10PercentileFirstSenior IS NOT NULL
     ) x ON x.personIdentifier = p.personIdentifier
-    SET p.top10PercentileAll = x.pct,
-        p.top10RankAll = x.rank_count,
-        p.top10DenominatorAll = x.denominator;
+    SET p.top10RankFirstSenior = x.personRank;
+    SET v_rows = ROW_COUNT();
+    CALL log_progress(v_job_id, v_step, 'Updated top10RankFirstSenior', 'INFO', v_rows, NULL);
 
     IF v_error_occurred THEN
         CALL log_progress(v_job_id, v_step, 'Failed', 'ERROR', NULL, v_error_message);

From 6ad492ee2e73bba35dbe452e7beafcf369baec14 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Tue, 14 Apr 2026 12:39:33 -0400
Subject: [PATCH 04/19] feat(schema): add 4 new Feature Generator fields

Capture new top-level and feedbackEvidence fields emitted by ReCiter:
  - datePublicationAddedToPMC (person_article, analysis_summary_article)
  - feedbackScoreTextSimilarity (person_article)
  - feedbackScoreJournalTitleSimilarity (person_article)
  - feedbackScoreBibliographicCoupling (person_article)

Schema, nightly SP (v2 + inline copy), legacy loose-SQL insert,
CSV transformer, and LOAD DATA column list updated in lockstep.

Adds idempotent migration setup/alter_add_feature_generator_fields_v1.1.sql
for applying to prod and dev DBs out-of-band. ALTER must run BEFORE
deploying the updated ETL image, otherwise LOAD DATA fails on unknown
columns.
---
 ...lter_add_feature_generator_fields_v1.1.sql | 80 +++++++++++++++++++
 setup/createDatabaseTableReciterDb.sql        |  5 ++
 setup/createEventsProceduresReciterDb.sql     | 12 +--
 setup/populateAnalysisSummaryTables_v2.sql    |  4 +-
 update/dataTransformer.py                     | 19 +++--
 update/updateReciterDB.py                     |  4 +-
 6 files changed, 112 insertions(+), 12 deletions(-)
 create mode 100644 setup/alter_add_feature_generator_fields_v1.1.sql

diff --git a/setup/alter_add_feature_generator_fields_v1.1.sql b/setup/alter_add_feature_generator_fields_v1.1.sql
new file mode 100644
index 0000000..ab5733a
--- /dev/null
+++ b/setup/alter_add_feature_generator_fields_v1.1.sql
@@ -0,0 +1,80 @@
+-- =============================================================================
+-- Migration: Add 4 new Feature Generator fields (v1.1)
+-- =============================================================================
+-- Adds columns introduced by ReCiter Feature Generator:
+--   - datePublicationAddedToPMC          (top-level article field)
+--   - feedbackScoreTextSimilarity        (evidence.feedbackEvidence)
+--   - feedbackScoreJournalTitleSimilarity (evidence.feedbackEvidence)
+--   - feedbackScoreBibliographicCoupling  (evidence.feedbackEvidence)
+--
+-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via
+-- information_schema check (no-op on re-run).
+--
+-- Run BEFORE deploying the updated Python ETL, otherwise LOAD DATA INFILE
+-- will fail with "Unknown column" on the 4 new headers.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- person_article: + datePublicationAddedToPMC + 3 feedback scores
+-- -----------------------------------------------------------------------------
+
+SET @db = DATABASE();
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'person_article'
+       AND column_name = 'datePublicationAddedToPMC') = 0,
+    'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`',
+    'SELECT ''person_article.datePublicationAddedToPMC already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'person_article'
+       AND column_name = 'feedbackScoreTextSimilarity') = 0,
+    'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL AFTER `feedbackScoreYear`',
+    'SELECT ''person_article.feedbackScoreTextSimilarity already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'person_article'
+       AND column_name = 'feedbackScoreJournalTitleSimilarity') = 0,
+    'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL AFTER `feedbackScoreTextSimilarity`',
+    'SELECT ''person_article.feedbackScoreJournalTitleSimilarity already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'person_article'
+       AND column_name = 'feedbackScoreBibliographicCoupling') = 0,
+    'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL AFTER `feedbackScoreJournalTitleSimilarity`',
+    'SELECT ''person_article.feedbackScoreBibliographicCoupling already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- analysis_summary_article: + datePublicationAddedToPMC
+-- (feedback scores NOT carried into summary — per-person-article only)
+-- -----------------------------------------------------------------------------
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'analysis_summary_article'
+       AND column_name = 'datePublicationAddedToPMC') = 0,
+    'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`',
+    'SELECT ''analysis_summary_article.datePublicationAddedToPMC already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+SELECT table_name, column_name, data_type, is_nullable
+FROM information_schema.columns
+WHERE table_schema = DATABASE()
+  AND column_name IN (
+    'datePublicationAddedToPMC',
+    'feedbackScoreTextSimilarity',
+    'feedbackScoreJournalTitleSimilarity',
+    'feedbackScoreBibliographicCoupling')
+ORDER BY table_name, ordinal_position;
diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql
index 13d04d8..cdedbd7 100644
--- a/setup/createDatabaseTableReciterDb.sql
+++ b/setup/createDatabaseTableReciterDb.sql
@@ -311,6 +311,7 @@ CREATE TABLE IF NOT EXISTS `analysis_summary_article` (
   `publicationDateDisplay` varchar(200) DEFAULT NULL,
   `publicationDateStandardized` varchar(128) DEFAULT NULL,
   `datePublicationAddedToEntrez` varchar(128) DEFAULT NULL,
+  `datePublicationAddedToPMC` varchar(128) DEFAULT NULL,
   `articleTitle` varchar(1000) DEFAULT NULL,
   `articleTitleRTF` varchar(2000) DEFAULT NULL,
   `publicationTypeCanonical` varchar(128) DEFAULT NULL,
@@ -654,6 +655,7 @@ CREATE TABLE IF NOT EXISTS `person_article` (
   `scopusNonTargetAuthorInstitutionalAffiliationSource` varchar(128) DEFAULT NULL,
   `scopusNonTargetAuthorInstitutionalAffiliationScore` float DEFAULT 0,
   `datePublicationAddedToEntrez` varchar(128) DEFAULT NULL,
+  `datePublicationAddedToPMC` varchar(128) DEFAULT NULL,
   `doi` varchar(128) DEFAULT NULL,
   `issn` varchar(128) DEFAULT NULL,
   `issue` varchar(500) DEFAULT 'NULL',
@@ -673,6 +675,9 @@ CREATE TABLE IF NOT EXISTS `person_article` (
   `feedbackScoreOrganization` float DEFAULT NULL,
   `feedbackScoreTargetAuthorName` float DEFAULT NULL,
   `feedbackScoreYear` float DEFAULT NULL,
+  `feedbackScoreTextSimilarity` float DEFAULT NULL,
+  `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL,
+  `feedbackScoreBibliographicCoupling` float DEFAULT NULL,
   `totalArticleScoreStandardized` int(11) DEFAULT NULL,
   `totalArticleScoreNonStandardized` float DEFAULT NULL,
   `targetAuthorCount` int(11) DEFAULT NULL,
diff --git a/setup/createEventsProceduresReciterDb.sql b/setup/createEventsProceduresReciterDb.sql
index 7891ea1..533878b 100644
--- a/setup/createEventsProceduresReciterDb.sql
+++ b/setup/createEventsProceduresReciterDb.sql
@@ -2455,10 +2455,10 @@ order by pmid desc, rank asc;
 
 #### 3. Populate "analysis_summary_article" table with articles ####
 
-insert into analysis_summary_article (pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus)
-select distinct 
-pmid, max(pmcid), publicationTypeCanonical, articleYear, min(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, max(timesCited)
-from person_article 
+insert into analysis_summary_article (pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, datePublicationAddedToEntrez, datePublicationAddedToPMC, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus)
+select distinct
+pmid, max(pmcid), publicationTypeCanonical, articleYear, min(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, max(datePublicationAddedToPMC), articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, max(timesCited)
+from person_article
 where userAssertion = 'ACCEPTED'
 group by pmid
 order by datePublicationAddedToEntrez desc;
@@ -3584,7 +3584,8 @@ proc_main: BEGIN
     INSERT INTO analysis_summary_article_new (
         pmid, pmcid, publicationTypeCanonical, articleYear,
         publicationDateStandardized, publicationDateDisplay,
-        datePublicationAddedToEntrez, articleTitle, journalTitleVerbose,
+        datePublicationAddedToEntrez, datePublicationAddedToPMC,
+        articleTitle, journalTitleVerbose,
         issn, doi, issue, volume, pages, citationCountScopus
     )
     SELECT DISTINCT
@@ -3595,6 +3596,7 @@ proc_main: BEGIN
         MIN(publicationDateStandardized),
         publicationDateDisplay,
         datePublicationAddedToEntrez,
+        MAX(datePublicationAddedToPMC),
         articleTitle,
         journalTitleVerbose,
         issn,
diff --git a/setup/populateAnalysisSummaryTables_v2.sql b/setup/populateAnalysisSummaryTables_v2.sql
index b8807e5..08b3e63 100644
--- a/setup/populateAnalysisSummaryTables_v2.sql
+++ b/setup/populateAnalysisSummaryTables_v2.sql
@@ -420,7 +420,8 @@ proc_main: BEGIN
     INSERT INTO analysis_summary_article_new (
         pmid, pmcid, publicationTypeCanonical, articleYear,
         publicationDateStandardized, publicationDateDisplay,
-        datePublicationAddedToEntrez, articleTitle, journalTitleVerbose,
+        datePublicationAddedToEntrez, datePublicationAddedToPMC,
+        articleTitle, journalTitleVerbose,
         issn, doi, issue, volume, pages, citationCountScopus
     )
     SELECT DISTINCT
@@ -431,6 +432,7 @@ proc_main: BEGIN
         MIN(publicationDateStandardized),
         publicationDateDisplay,
         datePublicationAddedToEntrez,
+        MAX(datePublicationAddedToPMC),
         articleTitle,
         journalTitleVerbose,
         issn,
diff --git a/update/dataTransformer.py b/update/dataTransformer.py
index e9328b8..f89d502 100644
--- a/update/dataTransformer.py
+++ b/update/dataTransformer.py
@@ -227,13 +227,15 @@ def process_person_article(items, output_path):
         "scopusNonTargetAuthorInstitutionalAffiliationSource",
         "scopusNonTargetAuthorInstitutionalAffiliationScore",
         
-        "datePublicationAddedToEntrez", "doi",
+        "datePublicationAddedToEntrez", "datePublicationAddedToPMC", "doi",
         "issn", "issue", "journalTitleISOabbreviation", "pages", "timesCited", "volume",
-        
+
         "feedbackScoreCites", "feedbackScoreCoAuthorName", "feedbackScoreEmail",
         "feedbackScoreInstitution", "feedbackScoreJournal", "feedbackScoreJournalSubField",
         "feedbackScoreKeyword", "feedbackScoreOrcid", "feedbackScoreOrcidCoAuthor",
         "feedbackScoreOrganization", "feedbackScoreTargetAuthorName", "feedbackScoreYear",
+        "feedbackScoreTextSimilarity", "feedbackScoreJournalTitleSimilarity",
+        "feedbackScoreBibliographicCoupling",
         "totalArticleScoreStandardized", "totalArticleScoreNonStandardized"
     ]
 
@@ -398,6 +400,7 @@ def process_person_article(items, output_path):
 
                         # Additional fields
                         date_publication_added_to_entrez = sanitize_field(article.get('datePublicationAddedToEntrez', ''))
+                        date_publication_added_to_pmc = sanitize_field(article.get('datePublicationAddedToPMC', ''))
                         doi = sanitize_field(article.get('doi', ''))
                         issn_list = article.get('issn', [])
                         issn = ''
@@ -432,11 +435,13 @@ def process_person_article(items, output_path):
                                     'feedbackScoreCites', 'feedbackScoreCoAuthorName', 'feedbackScoreEmail',
                                     'feedbackScoreInstitution', 'feedbackScoreJournal', 'feedbackScoreJournalSubField',
                                     'feedbackScoreKeyword', 'feedbackScoreOrcid', 'feedbackScoreOrcidCoAuthor',
-                                    'feedbackScoreOrganization', 'feedbackScoreTargetAuthorName', 'feedbackScoreYear'
+                                    'feedbackScoreOrganization', 'feedbackScoreTargetAuthorName', 'feedbackScoreYear',
+                                    'feedbackScoreTextSimilarity', 'feedbackScoreJournalTitleSimilarity',
+                                    'feedbackScoreBibliographicCoupling'
                                 ]
                             ]
                         else:
-                            feedback_scores = [''] * 12  # Assuming 12 feedback scores
+                            feedback_scores = [''] * 15  # Assuming 15 feedback scores
                         total_article_score_standardized = sanitize_field(article.get('totalArticleScoreStandardized', ''))
                         total_article_score_non_standardized = sanitize_field(article.get('totalArticleScoreNonStandardized', ''))                            
 
@@ -508,6 +513,7 @@ def process_person_article(items, output_path):
                             "scopusNonTargetAuthorInstitutionalAffiliationScore": scopus_non_target_author_institutional_affiliation_score,
                                                        
                             "datePublicationAddedToEntrez": date_publication_added_to_entrez,
+                            "datePublicationAddedToPMC": date_publication_added_to_pmc,
                             "doi": doi,
                             "issn": issn,
                             "issue": issue,
@@ -529,7 +535,10 @@ def process_person_article(items, output_path):
                             "feedbackScoreOrganization": feedback_scores[9],
                             "feedbackScoreTargetAuthorName": feedback_scores[10],
                             "feedbackScoreYear": feedback_scores[11],
-                            
+                            "feedbackScoreTextSimilarity": feedback_scores[12],
+                            "feedbackScoreJournalTitleSimilarity": feedback_scores[13],
+                            "feedbackScoreBibliographicCoupling": feedback_scores[14],
+
                             "totalArticleScoreStandardized": total_article_score_standardized,
                             "totalArticleScoreNonStandardized": total_article_score_non_standardized
                         }
diff --git a/update/updateReciterDB.py b/update/updateReciterDB.py
index 6aa3a7f..94d0e55 100644
--- a/update/updateReciterDB.py
+++ b/update/updateReciterDB.py
@@ -303,12 +303,14 @@ def main(truncate_tables=True, skip_person_temp=False):
                 "pubmedTargetAuthorInstitutionalAffiliationMatchTypeScore",
                 "scopusNonTargetAuthorInstitutionalAffiliationSource",
                 "scopusNonTargetAuthorInstitutionalAffiliationScore",
-                "datePublicationAddedToEntrez", "doi",
+                "datePublicationAddedToEntrez", "datePublicationAddedToPMC", "doi",
                 "issn", "issue", "journalTitleISOabbreviation", "pages", "timesCited", "volume",
                 "feedbackScoreCites", "feedbackScoreCoAuthorName", "feedbackScoreEmail",
                 "feedbackScoreInstitution", "feedbackScoreJournal", "feedbackScoreJournalSubField",
                 "feedbackScoreKeyword", "feedbackScoreOrcid", "feedbackScoreOrcidCoAuthor",
                 "feedbackScoreOrganization", "feedbackScoreTargetAuthorName", "feedbackScoreYear",
+                "feedbackScoreTextSimilarity", "feedbackScoreJournalTitleSimilarity",
+                "feedbackScoreBibliographicCoupling",
                 "totalArticleScoreStandardized", "totalArticleScoreNonStandardized"
             ],
             'person_article_author': [

From 9bd507cfb5b5253c601715d7dac22678ecc76069 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Tue, 14 Apr 2026 12:49:04 -0400
Subject: [PATCH 05/19] fix(migration): drop AFTER clauses to enable
 ALGORITHM=INSTANT

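Applied to prod and dev (MariaDB 11.4 / 10.6) in ~25s, network-bound.
AFTER placement forces the INPLACE algorithm with a metadata lock;
appending the columns at the end of the table allows ALGORITHM=INSTANT
(no table rewrite, no lock hold).

Note: migrated databases therefore carry the new columns at the end of
the table, while fresh installs from createDatabaseTableReciterDb.sql
place them next to the related columns. The LOAD DATA column list and
the summary INSERT in this series name their columns explicitly, so the
differing physical order does not affect them.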
---
 setup/alter_add_feature_generator_fields_v1.1.sql | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/setup/alter_add_feature_generator_fields_v1.1.sql b/setup/alter_add_feature_generator_fields_v1.1.sql
index ab5733a..fb4c8b7 100644
--- a/setup/alter_add_feature_generator_fields_v1.1.sql
+++ b/setup/alter_add_feature_generator_fields_v1.1.sql
@@ -24,7 +24,7 @@ SET @sql = (SELECT IF(
     (SELECT COUNT(*) FROM information_schema.columns
      WHERE table_schema = @db AND table_name = 'person_article'
        AND column_name = 'datePublicationAddedToPMC') = 0,
-    'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`',
+    'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL',
     'SELECT ''person_article.datePublicationAddedToPMC already exists'''));
 PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
 
@@ -32,7 +32,7 @@ SET @sql = (SELECT IF(
     (SELECT COUNT(*) FROM information_schema.columns
      WHERE table_schema = @db AND table_name = 'person_article'
        AND column_name = 'feedbackScoreTextSimilarity') = 0,
-    'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL AFTER `feedbackScoreYear`',
+    'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL',
     'SELECT ''person_article.feedbackScoreTextSimilarity already exists'''));
 PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
 
@@ -40,7 +40,7 @@ SET @sql = (SELECT IF(
     (SELECT COUNT(*) FROM information_schema.columns
      WHERE table_schema = @db AND table_name = 'person_article'
        AND column_name = 'feedbackScoreJournalTitleSimilarity') = 0,
-    'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL AFTER `feedbackScoreTextSimilarity`',
+    'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL',
     'SELECT ''person_article.feedbackScoreJournalTitleSimilarity already exists'''));
 PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
 
@@ -48,7 +48,7 @@ SET @sql = (SELECT IF(
     (SELECT COUNT(*) FROM information_schema.columns
      WHERE table_schema = @db AND table_name = 'person_article'
        AND column_name = 'feedbackScoreBibliographicCoupling') = 0,
-    'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL AFTER `feedbackScoreJournalTitleSimilarity`',
+    'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL',
     'SELECT ''person_article.feedbackScoreBibliographicCoupling already exists'''));
 PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
 
@@ -61,7 +61,7 @@ SET @sql = (SELECT IF(
     (SELECT COUNT(*) FROM information_schema.columns
      WHERE table_schema = @db AND table_name = 'analysis_summary_article'
        AND column_name = 'datePublicationAddedToPMC') = 0,
-    'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`',
+    'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL',
     'SELECT ''analysis_summary_article.datePublicationAddedToPMC already exists'''));
 PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
 

From 1ad2244bcd0af0eb713a9294d1dce59ed2d43f52 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Wed, 6 May 2026 13:06:30 -0400
Subject: [PATCH 06/19] fix(SP): fall back to publicationDateStandardized when
 articleYear is 0

The v2 populateAnalysisSummaryTables SP was selecting articleYear directly
from person_article without a fallback, leaving 74% of analysis_summary_article
rows with articleYear = 0. The legacy SP had a post-INSERT UPDATE step that
derived the year from publicationDateStandardized in those cases, which the
v2 rewrite dropped.

Push the fallback into the SELECT itself:
IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4))

Verified post-deployment: 0 rows with articleYear = 0 in prod and dev.
---
 setup/createEventsProceduresReciterDb.sql  | 2 +-
 setup/populateAnalysisSummaryTables_v2.sql | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup/createEventsProceduresReciterDb.sql b/setup/createEventsProceduresReciterDb.sql
index 533878b..c8a05f5 100644
--- a/setup/createEventsProceduresReciterDb.sql
+++ b/setup/createEventsProceduresReciterDb.sql
@@ -3592,7 +3592,7 @@ proc_main: BEGIN
         pmid,
         MAX(pmcid),
         publicationTypeCanonical,
-        articleYear,
+        IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4)),
         MIN(publicationDateStandardized),
         publicationDateDisplay,
         datePublicationAddedToEntrez,
diff --git a/setup/populateAnalysisSummaryTables_v2.sql b/setup/populateAnalysisSummaryTables_v2.sql
index 08b3e63..e2a6e3f 100644
--- a/setup/populateAnalysisSummaryTables_v2.sql
+++ b/setup/populateAnalysisSummaryTables_v2.sql
@@ -428,7 +428,7 @@ proc_main: BEGIN
         pmid,
         MAX(pmcid),
         publicationTypeCanonical,
-        articleYear,
+        IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4)),
         MIN(publicationDateStandardized),
         publicationDateDisplay,
         datePublicationAddedToEntrez,

From 9c215fd7a304920a639ce6caa2912b3be76d60d6 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Fri, 15 May 2026 18:43:12 -0400
Subject: [PATCH 07/19] fix(retrieveNIH): correct column inversion in
 analysis_nih_cites load

The iCite API returns two arrays per record:
  - cited_by: PMIDs that cite the queried article
  - references: PMIDs the queried article cites

The CSV write for cited_by entries used [cited_by_elem, queried_pmid],
but the LOAD DATA columns are (cited_pmid, citing_pmid). Since cited_by
elements are the CITING articles (not the cited), this stored every
cited_by edge with the cited/citing columns swapped. The references
loop was already correct. Same inversion affected analysis_nih_cites_clin.

Effect: queries like
  SELECT COUNT(*) FROM analysis_nih_cites WHERE cited_pmid = <pmid>
returned only the rows sourced from other queried PMIDs' references
arrays, missing the (typically much larger) cited_by set. Example:
PMID 32432483 has iCite citation_count=192 but only 19 rows surfaced
because just 19 WCM-tracked papers happened to reference it.

After this change, on the next nightly run the table swap will load
analysis_nih_cites with semantically correct (cited_pmid, citing_pmid)
columns, matching the join used by downstream consumers
(Scholars-Profile-System/lib/api/publication-detail.ts).
---
 update/retrieveNIH.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/update/retrieveNIH.py b/update/retrieveNIH.py
index 97b40bd..9b39d72 100644
--- a/update/retrieveNIH.py
+++ b/update/retrieveNIH.py
@@ -138,22 +138,23 @@ def write_records_to_csv(records, csv_files):
             nih_writer.writerow(nih_record)
             nih_count += 1
 
-            citing_pmid = get_dict_value(record, "pmid")
+            queried_pmid = get_dict_value(record, "pmid")
 
-            # Write to analysis_nih_cites
+            # iCite "cited_by" = articles that cite queried_pmid; "references" = articles queried_pmid cites.
+            # CSV column order matches LOAD DATA columns: (cited_pmid, citing_pmid).
             if record.get("cited_by"):
-                for cited_by in record["cited_by"]:
-                    cites_writer.writerow([cited_by, citing_pmid])
+                for citing in record["cited_by"]:
+                    cites_writer.writerow([queried_pmid, citing])
                     cites_count += 1
             if record.get("references"):
-                for ref in record["references"]:
-                    cites_writer.writerow([ref, citing_pmid])
+                for cited in record["references"]:
+                    cites_writer.writerow([cited, queried_pmid])
                     cites_count += 1
 
             # Write to analysis_nih_cites_clin
             if record.get("cited_by_clin"):
-                for cited_by_clin in record["cited_by_clin"]:
-                    cites_clin_writer.writerow([cited_by_clin, citing_pmid])
+                for citing_clin in record["cited_by_clin"]:
+                    cites_clin_writer.writerow([queried_pmid, citing_clin])
                     cites_clin_count +=1
 
         except Exception as e:

From 810bc177e5125b89f2d7e16fdd20a6ebef68491a Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Sat, 16 May 2026 19:43:46 -0400
Subject: [PATCH 08/19] fix(abstractImport): replace CSV bulk-load that
 silently dropped rows

abstractImport.py wrote abstracts to a CSV with csv.writer, then bulk-loaded
it with LOAD DATA LOCAL INFILE. The two are incompatible: csv.writer emits
\r\n line endings and doubled-quote escaping (""), while the LOAD used
LINES TERMINATED BY '\n' and ENCLOSED BY '"' with backslash escaping. Any
abstract containing a double quote desynced MySQL's row parser, so ~99.9% of
every file was dropped with no error raised (~43 of ~72,000 rows loaded).
The unresolved PMIDs were re-selected every cycle and the unbounded
while-True loop ran until the 15,000s pipeline timeout, restart-looping
indefinitely and blocking the nightly CronJob.

- Replace CSV + LOAD DATA with a parameterized, batched executemany INSERT
  so abstract text is stored verbatim regardless of content.
- Bound the fetch/insert loop: stop on no progress, cap at 25 cycles, so
  unresolvable PMIDs can no longer hang the pipeline.
- Retry batch_get_item UnprocessedKeys with backoff instead of dropping
  throttled keys.
- Connect with charset=utf8mb4; drop the now-unused local_infile flag.
- Add a --dry-run mode that verifies the fetch/insert path against a
  TEMPORARY table without touching reporting_abstracts.
---
 update/abstractImport.py | 338 +++++++++++++++++++++++++++------------
 1 file changed, 236 insertions(+), 102 deletions(-)

diff --git a/update/abstractImport.py b/update/abstractImport.py
index 867b538..8057b24 100644
--- a/update/abstractImport.py
+++ b/update/abstractImport.py
@@ -1,10 +1,10 @@
 # abstractImport.py
 
 import boto3
-import csv
 import logging
 import pymysql.cursors
 import pymysql.err
+import random
 import sys
 import time
 import os
@@ -20,6 +20,10 @@
 )
 logger = logging.getLogger(__name__)
 
+# Quiet botocore's per-call credential/endpoint chatter so pipeline logs stay readable.
+logging.getLogger("botocore").setLevel(logging.WARNING)
+logging.getLogger("boto3").setLevel(logging.WARNING)
+
 # ------------------------------------------------------------------------------
 # Environment Variables
 # ------------------------------------------------------------------------------
@@ -28,15 +32,31 @@
 DB_HOST = os.getenv("DB_HOST")
 DB_NAME = os.getenv("DB_NAME")
 
-# DynamoDB concurrency settings
-CHUNK_SIZE = 100       # Max items per batch_get_item call
-MAX_WORKERS = 5        # Number of threads for parallel fetching
+# ------------------------------------------------------------------------------
+# Settings
+# ------------------------------------------------------------------------------
+# DynamoDB fetch
+CHUNK_SIZE = 100              # Max keys per batch_get_item call (DynamoDB hard limit)
+MAX_WORKERS = 5               # Threads for parallel fetching
+MAX_UNPROCESSED_RETRIES = 8   # Backoff retries for keys DynamoDB reports as unprocessed
+
+# Insert
+INSERT_BATCH_SIZE = 200       # Rows per executemany batch (kept well under max_allowed_packet)
+
+# Loop safety
+MAX_CYCLES = 25               # Hard cap on fetch/insert cycles; a healthy run needs 1-2
+
+# Dry run
+DRY_RUN = "--dry-run" in sys.argv
+DRY_RUN_SAMPLE = 500          # PMIDs processed when --dry-run is passed
+DRY_RUN_TABLE = "reporting_abstracts_dryrun"
+
 
 # ------------------------------------------------------------------------------
 # Database Connection
 # ------------------------------------------------------------------------------
 def connect_mysql_server(db_user, db_pass, db_host, db_name):
-    """Function to connect to MySQL database"""
+    """Connect to the MariaDB database."""
     try:
         mysql_db = pymysql.connect(
             user=db_user,
@@ -44,7 +64,7 @@ def connect_mysql_server(db_user, db_pass, db_host, db_name):
             database=db_name,
             host=db_host,
             autocommit=True,
-            local_infile=True,
+            charset="utf8mb4",
             cursorclass=pymysql.cursors.DictCursor
         )
         logger.info(f"Connected to database server: {db_host}, database: {db_name}, user: {db_user}")
@@ -53,13 +73,14 @@ def connect_mysql_server(db_user, db_pass, db_host, db_name):
         logger.error(f"{time.ctime()} -- Error connecting to the database: {err}")
         sys.exit(1)
 
+
 # ------------------------------------------------------------------------------
 # Fetch All Missing PMIDs
 # ------------------------------------------------------------------------------
 def fetch_missing_pmids(mysql_conn):
     """
-    Returns a list of all PMIDs that exist in analysis_summary_article
-    but do NOT exist in reporting_abstracts.
+    Returns every PMID that exists in analysis_summary_article but has no
+    matching row in reporting_abstracts.
     """
     sql = """
         SELECT DISTINCT p.pmid AS pmid
@@ -69,16 +90,17 @@ def fetch_missing_pmids(mysql_conn):
     """
     with mysql_conn.cursor() as cursor:
         cursor.execute(sql)
-        rows = cursor.fetchall()
-        return [row["pmid"] for row in rows]
+        return [row["pmid"] for row in cursor.fetchall()]
+
 
 # ------------------------------------------------------------------------------
 # Extract Abstract Text
 # ------------------------------------------------------------------------------
 def get_abstract(item):
     """
-    Extracts the abstract text from a DynamoDB item representing a PubMed article.
-    Handles labeled abstract segments if present.
+    Extracts the abstract text from a DynamoDB item representing a PubMed
+    article. Handles labeled abstract segments. Returns "" when no abstract
+    is present.
     """
     medline_citation = item.get("pubmedarticle", {}).get("medlinecitation")
     if not medline_citation:
@@ -102,125 +124,237 @@ def get_abstract(item):
 
     return " ".join(abstract_texts) if abstract_texts else ""
 
+
 # ------------------------------------------------------------------------------
-# Batch Fetch Abstracts from DynamoDB
+# Fetch Abstracts from DynamoDB
 # ------------------------------------------------------------------------------
 def fetch_abstracts_for_chunk(chunk_pmids):
     """
-    Performs a single batch_get_item call for the given chunk of PMIDs.
-    Returns a list of (pmid, abstract_text) pairs.
+    Fetches one chunk of PMIDs from DynamoDB via batch_get_item. Any keys that
+    DynamoDB reports as unprocessed (throttling) are retried with exponential
+    backoff so they are not silently lost. Returns (pmid, abstract) pairs.
     """
-    dynamodb = boto3.resource("dynamodb")
-    client = dynamodb.meta.client
+    client = boto3.resource("dynamodb").meta.client
+
+    request_keys = [{"pmid": pmid} for pmid in chunk_pmids]
+    results = []
+    attempt = 0
 
-    # Prepare Keys for batch_get_item
-    keys = [{"pmid": pmid} for pmid in chunk_pmids]
+    while request_keys:
+        response = client.batch_get_item(
+            RequestItems={"PubMedArticle": {"Keys": request_keys}}
+        )
 
-    # Perform batch_get_item
-    response = client.batch_get_item(
-        RequestItems={
-            "PubMedArticle": {"Keys": keys}
-        }
-    )
+        for item in response["Responses"].get("PubMedArticle", []):
+            pmid = item.get("pmid")
+            if pmid is not None:
+                results.append((pmid, get_abstract(item)))
 
-    items = response["Responses"].get("PubMedArticle", [])
-    results = []
-    for item in items:
-        pmid = item.get("pmid")
-        if pmid:
-            abstract_text = get_abstract(item)
-            results.append((pmid, abstract_text))
+        request_keys = (
+            response.get("UnprocessedKeys", {})
+            .get("PubMedArticle", {})
+            .get("Keys", [])
+        )
+        if request_keys:
+            attempt += 1
+            if attempt > MAX_UNPROCESSED_RETRIES:
+                logger.warning(
+                    f"{len(request_keys)} key(s) still unprocessed after "
+                    f"{MAX_UNPROCESSED_RETRIES} retries; skipping this chunk's remainder."
+                )
+                break
+            time.sleep(min(0.1 * (2 ** attempt), 5.0))
 
     return results
 
+
+def fetch_all_abstracts(pmids):
+    """Fetches abstracts for all given PMIDs from DynamoDB in parallel."""
+    chunks = [pmids[i:i + CHUNK_SIZE] for i in range(0, len(pmids), CHUNK_SIZE)]
+    logger.info(f"Created {len(chunks)} chunk(s). Each chunk up to {CHUNK_SIZE} PMIDs.")
+
+    all_results = []
+    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        futures = {executor.submit(fetch_abstracts_for_chunk, c): c for c in chunks}
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                all_results.extend(future.result())
+            except Exception as e:
+                logger.exception(f"Error fetching chunk: {e}")
+    return all_results
+
+
 # ------------------------------------------------------------------------------
-# Bulk-Load a Single CSV into reporting_abstracts
+# Insert Abstracts
 # ------------------------------------------------------------------------------
-def load_csv_into_reporting_abstracts(mysql_conn, csv_path):
+def insert_abstracts(mysql_conn, results, target_table="reporting_abstracts"):
+    """
+    Inserts (pmid, abstract) pairs with a parameterized, batched INSERT.
+
+    pymysql binds every value as a query parameter, so abstracts containing
+    double quotes, tabs, newlines or backslashes are stored verbatim. The
+    previous CSV + LOAD DATA INFILE path could not parse such content and
+    silently dropped the affected rows.
+    """
+    if not results:
+        logger.info("No abstracts to insert.")
+        return 0
+
+    insert_sql = f"INSERT INTO {target_table} (pmid, abstract) VALUES (%s, %s)"
+    inserted = 0
     with mysql_conn.cursor() as cursor:
-        cwd = os.getcwd()
-        full_csv_path = os.path.join(cwd, csv_path).replace("\\", "/")  # Ensure correct path format
-
-        load_query = (
-            "LOAD DATA LOCAL INFILE '{path}' "
-            "INTO TABLE reporting_abstracts "
-            "FIELDS TERMINATED BY '\t' ENCLOSED BY '\"' "
-            "LINES TERMINATED BY '\n' "
-            "IGNORE 1 LINES (pmid, abstract);"
-        ).format(path=full_csv_path)
-
-        cursor.execute(load_query)
-        logger.info(f"{time.ctime()} -- {csv_path} loaded into reporting_abstracts.")
-
-        update_query = (
-            "UPDATE reporting_abstracts "
-            "SET abstractVarchar = CAST(abstract AS CHAR(15000)) "
-            "WHERE abstractVarchar IS NULL;"
+        for i in range(0, len(results), INSERT_BATCH_SIZE):
+            batch = results[i:i + INSERT_BATCH_SIZE]
+            cursor.executemany(insert_sql, batch)
+            inserted += len(batch)
+        logger.info(f"{time.ctime()} -- Inserted {inserted} row(s) into {target_table}.")
+
+        cursor.execute(
+            f"UPDATE {target_table} "
+            f"SET abstractVarchar = CAST(abstract AS CHAR(15000)) "
+            f"WHERE abstractVarchar IS NULL"
         )
-        cursor.execute(update_query)
-        logger.info(f"{time.ctime()} -- reporting_abstracts updated with varchar equivalents.")
+        logger.info(f"{time.ctime()} -- {target_table} updated with varchar equivalents.")
+    return inserted
+
 
 # ------------------------------------------------------------------------------
-# Main Script Logic
+# Dry Run
+# ------------------------------------------------------------------------------
+def run_dry_run(mysql_conn):
+    """
+    Verifies the fetch -> insert path end to end without modifying
+    reporting_abstracts: a random sample of missing PMIDs is processed into a
+    session-private TEMPORARY table, then verified and discarded.
+    """
+    logger.info("=== DRY RUN === reporting_abstracts will NOT be modified.")
+
+    all_pmids = fetch_missing_pmids(mysql_conn)
+    logger.info(f"{len(all_pmids)} PMID(s) currently missing abstracts in production.")
+    if not all_pmids:
+        logger.info("Nothing missing; no sample to process.")
+        mysql_conn.close()
+        return
+
+    sample = random.sample(all_pmids, min(DRY_RUN_SAMPLE, len(all_pmids)))
+    logger.info(f"Processing a random sample of {len(sample)} PMID(s) through the new insert path.")
+
+    try:
+        with mysql_conn.cursor() as cursor:
+            cursor.execute(f"CREATE TEMPORARY TABLE {DRY_RUN_TABLE} LIKE reporting_abstracts")
+
+        all_results = fetch_all_abstracts(sample)
+        logger.info(f"Fetched {len(all_results)} item(s) from DynamoDB (requested {len(sample)}).")
+        if not all_results:
+            logger.error("DRY RUN FAILED: DynamoDB returned nothing for the sample.")
+            return
+
+        poison = [
+            (p, a) for p, a in all_results
+            if a and any(c in a for c in ('"', '\t', '\n', '\r', '\\'))
+        ]
+        logger.info(
+            f"{len(poison)} of {len(all_results)} fetched abstracts contain "
+            f"quotes/tabs/newlines/backslashes -- the content the old LOAD DATA "
+            f"path silently dropped."
+        )
+        if poison:
+            logger.info(f"Example poison abstract (PMID {poison[0][0]}): {poison[0][1][:160]!r}")
+
+        inserted = insert_abstracts(mysql_conn, all_results, target_table=DRY_RUN_TABLE)
+
+        with mysql_conn.cursor() as cursor:
+            cursor.execute(
+                f"SELECT COUNT(*) c, COUNT(DISTINCT pmid) d, "
+                f"SUM(pmid IS NULL) nullp, SUM(abstractVarchar IS NULL) nullv "
+                f"FROM {DRY_RUN_TABLE}"
+            )
+            stats = cursor.fetchone()
+
+        counts_ok = (
+            stats["c"] == len(all_results)
+            and not stats["nullp"]
+            and not stats["nullv"]
+        )
+
+        # Content-integrity check: re-read the longest poison abstract verbatim.
+        integrity_ok = True
+        if poison:
+            worst_pmid, worst_abs = max(poison, key=lambda x: len(x[1]))
+            with mysql_conn.cursor() as cursor:
+                cursor.execute(
+                    f"SELECT abstract FROM {DRY_RUN_TABLE} WHERE pmid = %s", (worst_pmid,)
+                )
+                stored = cursor.fetchone()["abstract"]
+            if isinstance(stored, bytes):
+                stored = stored.decode("utf-8")
+            integrity_ok = (stored == worst_abs)
+            logger.info(
+                f"Content-integrity check on PMID {worst_pmid} "
+                f"({len(worst_abs)} chars, contains poison characters): "
+                f"{'MATCH' if integrity_ok else 'MISMATCH'}"
+            )
+
+        if counts_ok and integrity_ok:
+            logger.info(
+                f"DRY RUN PASSED -- {inserted} row(s) inserted; {stats['c']} present; "
+                f"{stats['d']} distinct PMIDs; 0 NULL pmids; 0 NULL abstractVarchar; "
+                f"content stored verbatim."
+            )
+        else:
+            logger.error(
+                f"DRY RUN FAILED -- rows={stats['c']} (expected {len(all_results)}); "
+                f"null_pmid={stats['nullp']}; null_varchar={stats['nullv']}; "
+                f"integrity_ok={integrity_ok}"
+            )
+    finally:
+        with mysql_conn.cursor() as cursor:
+            cursor.execute(f"DROP TEMPORARY TABLE IF EXISTS {DRY_RUN_TABLE}")
+        logger.info(f"Scratch table {DRY_RUN_TABLE} dropped.")
+        mysql_conn.close()
+
+
+# ------------------------------------------------------------------------------
+# Main
 # ------------------------------------------------------------------------------
 def main():
-    # 1) Connect to MySQL
     mysql_conn = connect_mysql_server(DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME)
 
-    while True:
-        # 2) Fetch all missing PMIDs
+    if DRY_RUN:
+        run_dry_run(mysql_conn)
+        return
+
+    prev_missing = None
+    for cycle in range(1, MAX_CYCLES + 1):
         all_pmids = fetch_missing_pmids(mysql_conn)
         if not all_pmids:
             logger.info("No more missing abstracts. We are done.")
             break
 
-        logger.info(f"Found {len(all_pmids)} PMIDs needing abstracts.")
-
-        # 3) Remove any existing abstract.csv
-        csv_path = "abstract.csv"
-        if os.path.exists(csv_path):
-            os.remove(csv_path)
+        logger.info(f"Cycle {cycle}: found {len(all_pmids)} PMID(s) needing abstracts.")
 
-        # 4) Chunk the PMIDs
-        chunks = [
-            all_pmids[i : i + CHUNK_SIZE]
-            for i in range(0, len(all_pmids), CHUNK_SIZE)
-        ]
-        logger.info(f"Created {len(chunks)} chunk(s). Each chunk up to {CHUNK_SIZE} PMIDs.")
-
-        # Accumulate all results in memory for this iteration
-        all_results = []
-
-        # 5) Parallel fetch from DynamoDB
-        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
-            future_to_chunk = {
-                executor.submit(fetch_abstracts_for_chunk, c): c for c in chunks
-            }
-            for future in concurrent.futures.as_completed(future_to_chunk):
-                try:
-                    chunk_result = future.result()
-                    all_results.extend(chunk_result)
-                except Exception as e:
-                    logger.exception(f"Error fetching chunk: {e}")
-
-        logger.info(f"Fetched abstracts for {len(all_results)} PMIDs in this cycle.")
-
-        # 6) Write to CSV
-        with open(csv_path, "w", newline="", encoding="utf-8") as f:
-            writer = csv.writer(f, delimiter="\t")
-            writer.writerow(["pmid", "abstract"])
-            for pmid, abstract_text in all_results:
-                writer.writerow([pmid, abstract_text])
-
-        # 7) Load CSV into DB
-        load_csv_into_reporting_abstracts(mysql_conn, csv_path)
-
-        # We then loop again in case there are additional PMIDs that
-        # appeared or newly became missing. Usually, you won't see more,
-        # but if your data is updated behind the scenes, it handles that too.
+        # Safety net: if a cycle does not reduce the missing count, the
+        # remaining PMIDs cannot be resolved (no DynamoDB record). Stop rather
+        # than loop forever -- the failure mode that hung the nightly pipeline.
+        if prev_missing is not None and len(all_pmids) >= prev_missing:
+            logger.warning(
+                f"No progress since the previous cycle ({len(all_pmids)} PMID(s) "
+                f"still missing); stopping. These PMIDs have no retrievable abstract."
+            )
+            break
+        prev_missing = len(all_pmids)
+
+        all_results = fetch_all_abstracts(all_pmids)
+        logger.info(f"Fetched abstracts for {len(all_results)} PMID(s) from DynamoDB.")
+        insert_abstracts(mysql_conn, all_results)
+    else:
+        logger.warning(
+            f"Reached the {MAX_CYCLES}-cycle safety limit with abstracts still "
+            f"missing; stopping. A healthy run converges in 1-2 cycles -- investigate."
+        )
 
     mysql_conn.close()
-    logger.info("All missing abstracts have now been imported.")
+    logger.info("Abstract import complete.")
 
 
 if __name__ == "__main__":

From a97b535876afd1dd955f4f0632cebc875b39248a Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Sat, 9 May 2026 08:54:39 -0400
Subject: [PATCH 09/19] feat(reporter): NIH RePORTER ETL with provenance
 reconciliation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a new nightly ETL step (retrieveReporter.py) that pulls grant metadata
and pub-grant linkages from NIH RePORTER (api.reporter.nih.gov/v2) and
reconciles them against the existing PubMed-derived person_article_grant
table.

Three new tables (alter_add_reporter_fields_v1.2.sql):
  - grant_reporter_project — RePORTER /projects/search results, refreshed
    each cycle (truncate-reload). Includes abstract_text for cross-reference.
  - grant_reporter_link — RePORTER /publications/search (pmid, appl_id) pairs.
  - grant_provenance — long-lived per-(person, pmid, grant) audit log with
    source_reporter, source_reciterdb, *_first_seen, last_verified. Survives
    the nightly truncate-reload of person_article_grant. Keyed by
    (personIdentifier, pmid, core_project_num) where core_project_num is the
    normalized NIH grant identifier (e.g. R01DK127777).

ETL strategy:
  - Projects: filtered by org_names=["WEILL MEDICAL COLL OF CORNELL UNIV"],
    partitioned by fiscal year to stay under the 9,999 offset cap (WCM has
    ~15K projects historically). 1 req/sec rate limit honored.
  - Publications: keyed by appl_ids from projects, batched.
  - Reconciliation: bulk INSERT...SELECT ON DUPLICATE KEY UPDATE on both
    sides. Reciterdb side stages normalized grant strings via temp table +
    LOAD DATA LOCAL INFILE; RePORTER side joins grant_reporter_link to
    person_article (userAssertion='ACCEPTED') as the false-positive guard.
  - Subaward caveat: WCM-as-subaward will be missed by the org filter
    (false-negative tradeoff accepted to keep false positives near zero).

Validated on dev: 235K provenance rows from 33K reciterdb + 132K RePORTER
inputs. End-to-end ~18 min (15 min publications fetch is the rate-limited
floor; reconciliation completes in ~10 sec via bulk SQL).
---
 setup/alter_add_reporter_fields_v1.2.sql | 127 +++++++
 update/retrieveReporter.py               | 429 +++++++++++++++++++++++
 update/run_all.py                        |   3 +-
 3 files changed, 558 insertions(+), 1 deletion(-)
 create mode 100644 setup/alter_add_reporter_fields_v1.2.sql
 create mode 100644 update/retrieveReporter.py

diff --git a/setup/alter_add_reporter_fields_v1.2.sql b/setup/alter_add_reporter_fields_v1.2.sql
new file mode 100644
index 0000000..1520eb5
--- /dev/null
+++ b/setup/alter_add_reporter_fields_v1.2.sql
@@ -0,0 +1,127 @@
+-- =============================================================================
+-- Migration: NIH RePORTER integration (v1.2)
+-- =============================================================================
+-- Adds the tables needed to ingest pub-grant linkages and project metadata
+-- from NIH RePORTER (https://api.reporter.nih.gov/v2/) and to track per-pair
+-- provenance over time.
+--
+-- WHY SEPARATE TABLES (not columns on person_article_grant):
+--   person_article_grant is TRUNCATE-reloaded by updateReciterDB.py every
+--   night from ReCiter scoring output (see updateReciterDB.py:241). Any
+--   provenance columns added directly to that table would be wiped on each
+--   nightly run, defeating the purpose of *_first_seen tracking. The
+--   provenance table below is updated incrementally and survives reloads.
+--
+-- WHAT'S CREATED:
+--   1. grant_reporter_project — RePORTER /projects/search results
+--   2. grant_reporter_link    — RePORTER /publications/search results
+--   3. grant_provenance       — long-lived per-(person, pmid, grant)
+--                               source-and-timestamp log
+--
+-- Safe to run on prod and dev. Idempotent (CREATE TABLE IF NOT EXISTS).
+-- Run BEFORE deploying retrieveReporter.py.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- grant_reporter_project — RePORTER project metadata
+-- -----------------------------------------------------------------------------
+-- One row per RePORTER appl_id returned by /projects/search for the configured
+-- WCM org filter. Refreshed each ETL cycle (truncate-reload OK; no historical
+-- state to preserve here — RePORTER is the source of truth).
+--
+-- abstract_text is stored here as a cross-reference. The Funding UI reads
+-- abstracts from Postgres (Scholars-Profile-System) where they're joined to
+-- InfoEd grant rows; this column exists for ad-hoc analysis and future
+-- reciterdb-side consumers.
+-- -----------------------------------------------------------------------------
+
+CREATE TABLE IF NOT EXISTS `grant_reporter_project` (
+  `appl_id` int(11) NOT NULL,
+  `core_project_num` varchar(32) DEFAULT NULL,
+  `project_title` varchar(512) DEFAULT NULL,
+  `org_name` varchar(255) DEFAULT NULL,
+  `fiscal_year` smallint(6) DEFAULT NULL,
+  `activity_code` varchar(8) DEFAULT NULL,
+  `project_start_date` date DEFAULT NULL,
+  `project_end_date` date DEFAULT NULL,
+  `abstract_text` mediumtext DEFAULT NULL,
+  `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (`appl_id`),
+  KEY `core_project_num` (`core_project_num`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------------------------------------------------------
+-- grant_reporter_link — RePORTER pub-grant linkages
+-- -----------------------------------------------------------------------------
+-- One row per (pmid, appl_id) pair returned by /publications/search.
+-- Refreshed each ETL cycle (truncate-reload). The grant_provenance table
+-- below is what carries history.
+-- -----------------------------------------------------------------------------
+
+CREATE TABLE IF NOT EXISTS `grant_reporter_link` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `pmid` int(11) NOT NULL,
+  `appl_id` int(11) NOT NULL,
+  `core_project_num` varchar(32) DEFAULT NULL,
+  `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `uk_pmid_appl_id` (`pmid`, `appl_id`),
+  KEY `pmid` (`pmid`),
+  KEY `core_project_num` (`core_project_num`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------------------------------------------------------
+-- grant_provenance — per-(person, pmid, grant) source and timestamp log
+-- -----------------------------------------------------------------------------
+-- The audit log that survives nightly truncate-reload of person_article_grant.
+-- Keyed by (personIdentifier, pmid, core_project_num) where core_project_num
+-- is the normalized NIH grant identifier (e.g. "R01DK127777" — no year suffix,
+-- no spaces). For non-NIH grants the original articleGrant string is stored
+-- in core_project_num as a fallback so the row is still keyable.
+--
+-- Update logic (run nightly by retrieveReporter.py after person_article_grant
+-- has been TRUNCATE-reloaded by updateReciterDB.py):
+--
+--   1. UPSERT from person_article_grant: any (personIdentifier, pmid,
+--      normalized_grant) currently in person_article_grant gets
+--      source_reciterdb=1 and last_verified=NOW(). reciterdb_first_seen is
+--      set on first insert and never overwritten.
+--
+--   2. UPSERT from grant_reporter_link joined to person_article (where
+--      userAssertion='ACCEPTED' to scope to confirmed WCM authors): any
+--      (personIdentifier, pmid, core_project_num) seen in RePORTER gets
+--      source_reporter=1 and last_verified=NOW(). reporter_first_seen is
+--      set on first insert and never overwritten.
+--
+-- Subaward caution: see retrieveReporter.py — we filter RePORTER projects
+-- to org_names=["WEILL MEDICAL COLL OF CORNELL UNIV"] and join PMIDs to
+-- person_article ACCEPTED rows. This minimizes false positives at the cost
+-- of missing some legitimate WCM-as-subaward linkages.
+-- -----------------------------------------------------------------------------
+
+CREATE TABLE IF NOT EXISTS `grant_provenance` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `personIdentifier` varchar(128) NOT NULL,
+  `pmid` int(11) NOT NULL,
+  `core_project_num` varchar(64) NOT NULL,
+  `appl_id` int(11) DEFAULT NULL,
+  `source_reporter` tinyint(1) NOT NULL DEFAULT 0,
+  `source_reciterdb` tinyint(1) NOT NULL DEFAULT 0,
+  `reporter_first_seen` datetime DEFAULT NULL,
+  `reciterdb_first_seen` datetime DEFAULT NULL,
+  `last_verified` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `uk_person_pmid_grant` (`personIdentifier`, `pmid`, `core_project_num`),
+  KEY `pmid` (`pmid`),
+  KEY `appl_id` (`appl_id`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+SELECT table_name, table_rows, create_time
+FROM information_schema.tables
+WHERE table_schema = DATABASE()
+  AND table_name IN ('grant_reporter_project', 'grant_reporter_link', 'grant_provenance')
+ORDER BY table_name;
diff --git a/update/retrieveReporter.py b/update/retrieveReporter.py
new file mode 100644
index 0000000..7d938d7
--- /dev/null
+++ b/update/retrieveReporter.py
@@ -0,0 +1,429 @@
+# retrieveReporter.py
+#
+# Pulls grant metadata and pub-grant linkages from NIH RePORTER
+# (https://api.reporter.nih.gov/v2/) and reconciles them against the
+# ReCiter-derived person_article_grant table.
+#
+# Two API loops:
+#   1. POST /projects/search filtered by WCM org name → grant_reporter_project
+#   2. POST /publications/search keyed by appl_ids from step 1 → grant_reporter_link
+#
+# Then a SQL reconciliation step populates grant_provenance, the long-lived
+# (person, pmid, grant)-keyed audit log that survives the nightly truncate-
+# reload of person_article_grant. See setup/alter_add_reporter_fields_v1.2.sql
+# for the full design rationale.
+#
+# Why we filter by org_name rather than fetching everything:
+#   RePORTER returns thousands of WCM-attributed projects; fetch_projects()
+#   partitions by FY when the total exceeds the 9,999 offset cap. Pulling
+#   the full corpus would be far larger and gives no benefit for our use
+#   case. Subaward caveat: WCM-as-sub may not appear under this org filter —
+#   accepted as a false-negative tradeoff to keep false positives near zero.
+
+import os
+import sys
+import csv
+import time
+import random
+import re
+import logging
+import faulthandler
+import signal
+import requests
+import pymysql.cursors
+import pymysql.err
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('retrieveReporter.log', mode='w'),
+        logging.StreamHandler(sys.stdout),
+    ],
+)
+logger = logging.getLogger(__name__)
+
+faulthandler.enable(file=sys.stderr, all_threads=True)
+faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True)
+
+REPORTER_BASE_URL = 'https://api.reporter.nih.gov/v2'
+WCM_ORG_NAME = 'WEILL MEDICAL COLL OF CORNELL UNIV'
+PAGE_LIMIT = 500
+OFFSET_CAP = 9999
+REQUEST_INTERVAL_SEC = 1.0  # NIH guidance: 1 req/sec
+PUBS_BATCH_SIZE = 50  # appl_ids per /publications/search call
+
+# core_project_num pattern, e.g. "R01DK127777", "U01AI189285", "K23MH112873".
+# Activity code is 1 letter + 1-2 digits, then 2 letters (IC) + 5-7 digits.
+CORE_PROJECT_RE = re.compile(r'\b([A-Z]\d{1,2}[A-Z]{2}\d{5,7})\b')
+
+
+def connect_db(max_retries=5, backoff_factor=1):
+    username = os.environ['DB_USERNAME']
+    password = os.environ['DB_PASSWORD']
+    hostname = os.environ['DB_HOST']
+    database = os.environ['DB_NAME']
+    for retry in range(max_retries):
+        try:
+            conn = pymysql.connect(
+                user=username,
+                password=password,
+                database=database,
+                host=hostname,
+                local_infile=True,
+                cursorclass=pymysql.cursors.DictCursor,
+            )
+            logger.info('Connected to database %s on %s', database, hostname)
+            return conn
+        except pymysql.err.MySQLError as err:
+            logger.error('DB connect attempt %d failed: %s', retry + 1, err)
+            time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 1))
+    raise RuntimeError('Could not connect to database after retries')
+
+
+def post_with_retry(url, payload, max_retries=5, backoff_factor=1):
+    """POST with exponential backoff; RuntimeError after max_retries. NIH's
+    1 req/sec pacing is the caller's job (sleep between successful calls)."""
+    for retry in range(max_retries):
+        try:
+            r = requests.post(url, json=payload, timeout=(10, 90))
+            if r.status_code == 429:
+                wait = backoff_factor * (2 ** retry) + random.uniform(0, 5)
+                logger.warning('429 from RePORTER; sleeping %.1fs', wait)
+                time.sleep(wait)
+                continue
+            r.raise_for_status()
+            return r.json()
+        except requests.exceptions.RequestException as e:
+            wait = backoff_factor * (2 ** retry) + random.uniform(0, 1)
+            logger.error('RePORTER request failed (attempt %d): %s; sleep %.1fs',
+                         retry + 1, e, wait)
+            time.sleep(wait)
+    raise RuntimeError(f'RePORTER request failed after {max_retries} retries: {url}')
+
+
+def _fetch_projects_page(criteria):
+    """Yield project dicts for a single criteria block. Caller must ensure
+    the result set fits under OFFSET_CAP; we log and stop if it doesn't."""
+    url = f'{REPORTER_BASE_URL}/projects/search'
+    offset = 0
+    while offset <= OFFSET_CAP:
+        payload = {
+            'criteria': criteria,
+            'limit': PAGE_LIMIT,
+            'offset': offset,
+        }
+        data = post_with_retry(url, payload)
+        results = data.get('results', []) or []
+        if not results:
+            return
+        for row in results:
+            yield row
+        meta = data.get('meta', {}) or {}
+        total = meta.get('total', 0)
+        offset += PAGE_LIMIT
+        if offset >= total:
+            return
+        if offset > OFFSET_CAP:
+            logger.warning(
+                'Result set has %d records but offset cap is %d; truncating. '
+                'Caller should partition further (e.g. by activity_code).',
+                total, OFFSET_CAP)
+            return
+        time.sleep(REQUEST_INTERVAL_SEC)
+
+
+def fetch_projects(base_criteria):
+    """Yield project dicts, partitioning by fiscal year when needed to stay
+    under the offset cap. WCM has ~15K projects historically, which exceeds
+    the 9,999 offset limit on a single criteria block.
+
+    Strategy: probe total once with the base criteria. If under the cap,
+    return all in one stream. Otherwise iterate fiscal years from the
+    earliest NIH grant year (1985) through next year, requesting
+    fiscal_years=[FY] for each."""
+    probe = post_with_retry(
+        f'{REPORTER_BASE_URL}/projects/search',
+        {'criteria': base_criteria, 'limit': 1, 'offset': 0},
+    )
+    total = (probe.get('meta', {}) or {}).get('total', 0)
+    logger.info('RePORTER /projects/search reports %d total matches for base criteria', total)
+
+    if total <= OFFSET_CAP:
+        yield from _fetch_projects_page(base_criteria)
+        return
+
+    import datetime
+    end_fy = datetime.date.today().year + 1
+    for fy in range(1985, end_fy + 1):
+        criteria = dict(base_criteria)
+        criteria['fiscal_years'] = [fy]
+        yielded_this_fy = 0
+        for row in _fetch_projects_page(criteria):
+            yielded_this_fy += 1
+            yield row
+        if yielded_this_fy:
+            logger.info('FY %d: yielded %d projects', fy, yielded_this_fy)
+        time.sleep(REQUEST_INTERVAL_SEC)
+
+
+def fetch_publications_for_appl_ids(appl_ids):
+    """Yield (pmid, appl_id, core_project_num) tuples from /publications/search
+    in batches of PUBS_BATCH_SIZE."""
+    url = f'{REPORTER_BASE_URL}/publications/search'
+    appl_ids = list({int(x) for x in appl_ids if x is not None})
+    for i in range(0, len(appl_ids), PUBS_BATCH_SIZE):
+        batch = appl_ids[i:i + PUBS_BATCH_SIZE]
+        offset = 0
+        while offset <= OFFSET_CAP:
+            payload = {
+                'criteria': {'appl_ids': batch},
+                'limit': PAGE_LIMIT,
+                'offset': offset,
+            }
+            data = post_with_retry(url, payload)
+            results = data.get('results', []) or []
+            if not results:
+                break
+            for row in results:
+                pmid = row.get('pmid')
+                appl_id = row.get('applid') or row.get('appl_id')
+                core = row.get('coreproject') or row.get('core_project_num')
+                if pmid and appl_id:
+                    yield int(pmid), int(appl_id), core
+            meta = data.get('meta', {}) or {}
+            total = meta.get('total', 0)
+            offset += PAGE_LIMIT
+            if offset >= total:
+                break
+            time.sleep(REQUEST_INTERVAL_SEC)
+        time.sleep(REQUEST_INTERVAL_SEC)
+
+
+def reload_table(conn, table, rows, columns):
+    """Truncate `table` and insert `rows` (list of tuples matching `columns`).
+    Used for the staging tables grant_reporter_project and grant_reporter_link.
+    grant_provenance is upserted, not reloaded."""
+    placeholders = ', '.join(['%s'] * len(columns))
+    col_list = ', '.join(f'`{c}`' for c in columns)
+    cur = conn.cursor()
+    cur.execute(f'TRUNCATE TABLE `{table}`')
+    if rows:
+        sql = f'INSERT INTO `{table}` ({col_list}) VALUES ({placeholders})'
+        cur.executemany(sql, rows)
+    conn.commit()
+    cur.execute(f'SELECT COUNT(*) AS c FROM `{table}`')
+    count = cur.fetchone()['c']
+    logger.info('Reloaded %s: %d rows', table, count)
+
+
+def normalize_grant_string(raw):
+    """Extract a core project number (e.g. R01DK127777) from an NIH grant
+    string. Separators are stripped before the word-boundary match, so
+    R01DK127777-01 returns None; caller decides on a raw-string fallback."""
+    if not raw:
+        return None
+    upper = re.sub(r'[\s\-\/]', '', raw.upper())
+    m = CORE_PROJECT_RE.search(upper)
+    return m.group(1) if m else None
+
+
+def reconcile_provenance(conn):
+    """Populate grant_provenance from person_article_grant and grant_reporter_link.
+
+    Bulk pattern: each side does a single INSERT...SELECT with ON DUPLICATE
+    KEY UPDATE so we make one round trip per side instead of one per row.
+    First_seen timestamps stick because they're only in the INSERT clause,
+    not the UPDATE clause."""
+    import tempfile
+    cur = conn.cursor()
+
+    # ----- (1) reciterdb side -----
+    # Normalization (free-text articleGrant → core_project_num) happens in
+    # Python, so we stage the normalized rows in a temp table first via
+    # LOAD DATA LOCAL INFILE, then do a single bulk upsert.
+    logger.info('Reading person_article_grant for reconciliation')
+    cur.execute("""
+        SELECT personIdentifier, pmid, articleGrant
+        FROM person_article_grant
+        WHERE personIdentifier IS NOT NULL
+          AND pmid > 0
+          AND articleGrant IS NOT NULL
+          AND articleGrant <> ''
+    """)
+    pag_rows = cur.fetchall()
+    logger.info('person_article_grant rows considered: %d', len(pag_rows))
+
+    # Normalize + dedupe in Python (the temp table's PK enforces uniqueness
+    # but deduping here avoids LOAD DATA INFILE warnings on duplicate rows).
+    seen = set()
+    normalized = []
+    for row in pag_rows:
+        n = normalize_grant_string(row['articleGrant'])
+        if not n:
+            # Non-NIH fallback: sanitize control chars (CSV uses TAB delim)
+            n = re.sub(r'[\t\n\r]', ' ', row['articleGrant'])[:64]
+        key = (row['personIdentifier'], row['pmid'], n)
+        if key in seen:
+            continue
+        seen.add(key)
+        normalized.append(key)
+    logger.info('Normalized + deduped to %d distinct (person, pmid, grant) rows',
+                len(normalized))
+
+    csv_file = tempfile.NamedTemporaryFile(
+        delete=False, mode='w', suffix='.csv', newline='', encoding='utf-8')
+    try:
+        writer = csv.writer(csv_file, delimiter='\t', lineterminator='\n',
+                            quoting=csv.QUOTE_NONE, escapechar='\\')
+        for r in normalized:
+            writer.writerow(r)
+        csv_file.close()
+
+        cur.execute("DROP TEMPORARY TABLE IF EXISTS _reciter_grant_staging")
+        cur.execute("""
+            CREATE TEMPORARY TABLE _reciter_grant_staging (
+                personIdentifier VARCHAR(128) NOT NULL,
+                pmid INT NOT NULL,
+                core_project_num VARCHAR(64) NOT NULL,
+                PRIMARY KEY (personIdentifier, pmid, core_project_num)
+            ) ENGINE=InnoDB
+        """)
+        load_sql = (
+            f"LOAD DATA LOCAL INFILE '{csv_file.name}' "
+            "INTO TABLE _reciter_grant_staging "
+            "FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n' "
+            "(personIdentifier, pmid, core_project_num)"
+        )
+        cur.execute(load_sql)
+        cur.execute("SELECT COUNT(*) AS c FROM _reciter_grant_staging")
+        logger.info('Loaded %d rows into reciterdb staging table',
+                    cur.fetchone()['c'])
+
+        cur.execute("""
+            INSERT INTO grant_provenance
+                (personIdentifier, pmid, core_project_num,
+                 source_reciterdb, reciterdb_first_seen, last_verified)
+            SELECT personIdentifier, pmid, core_project_num,
+                   1, NOW(), NOW()
+            FROM _reciter_grant_staging
+            ON DUPLICATE KEY UPDATE
+                source_reciterdb = 1,
+                last_verified = NOW()
+        """)
+        # rowcount on bulk upsert is "1 per insert + 2 per update" in MariaDB
+        # — informative, not exact
+        logger.info('Reciterdb-side bulk upsert: %d rowcount', cur.rowcount)
+        cur.execute("DROP TEMPORARY TABLE _reciter_grant_staging")
+        conn.commit()
+    finally:
+        try:
+            os.unlink(csv_file.name)
+        except OSError:
+            pass
+
+    # ----- (2) RePORTER side -----
+    # Pure SQL — no Python iteration. The JOIN to person_article enforces
+    # the false-positive guard (only ACCEPTED PMIDs credit a person).
+    # GROUP BY collapses cases where one (person, pmid, core_project) has
+    # multiple appl_ids (different fiscal years of the same grant); MAX
+    # picks the most recent appl_id deterministically.
+    logger.info('Running RePORTER-side bulk upsert')
+    cur.execute("""
+        INSERT INTO grant_provenance
+            (personIdentifier, pmid, core_project_num, appl_id,
+             source_reporter, reporter_first_seen, last_verified)
+        SELECT pa.personIdentifier, grl.pmid, grl.core_project_num,
+               MAX(grl.appl_id), 1, NOW(), NOW()
+        FROM grant_reporter_link grl
+        JOIN person_article pa
+          ON pa.pmid = grl.pmid
+         AND pa.userAssertion = 'ACCEPTED'
+        WHERE grl.core_project_num IS NOT NULL
+        GROUP BY pa.personIdentifier, grl.pmid, grl.core_project_num
+        ON DUPLICATE KEY UPDATE
+            source_reporter = 1,
+            appl_id = COALESCE(VALUES(appl_id), grant_provenance.appl_id),
+            last_verified = NOW()
+    """)
+    logger.info('RePORTER-side bulk upsert: %d rowcount', cur.rowcount)
+    conn.commit()
+
+    # Summary
+    cur.execute("SELECT COUNT(*) AS c FROM grant_provenance")
+    total = cur.fetchone()['c']
+    cur.execute("SELECT COUNT(*) AS c FROM grant_provenance WHERE source_reporter = 1 AND source_reciterdb = 1")
+    both = cur.fetchone()['c']
+    cur.execute("SELECT COUNT(*) AS c FROM grant_provenance WHERE source_reporter = 1 AND source_reciterdb = 0")
+    rep_only = cur.fetchone()['c']
+    cur.execute("SELECT COUNT(*) AS c FROM grant_provenance WHERE source_reporter = 0 AND source_reciterdb = 1")
+    reciter_only = cur.fetchone()['c']
+    logger.info('Provenance totals: %d rows | both=%d | reporter-only=%d | reciter-only=%d',
+                total, both, rep_only, reciter_only)
+
+
+def main():
+    org_name = os.environ.get('REPORTER_ORG_NAME', WCM_ORG_NAME)
+    logger.info('Starting RePORTER ETL for org: %s', org_name)
+
+    conn = connect_db()
+
+    # ----- Loop A: projects -----
+    # No include_fields — the API expects CamelCase there ('ApplId') but
+    # response field names are snake_case ('appl_id'). Easier to take all
+    # fields back than maintain two name conventions.
+    project_rows = []
+    appl_ids = []
+    for proj in fetch_projects(base_criteria={'org_names': [org_name]}):
+        appl_id = proj.get('appl_id')
+        if not appl_id:
+            continue
+        appl_ids.append(appl_id)
+        org = (proj.get('organization') or {}).get('org_name')
+        project_rows.append((
+            int(appl_id),
+            proj.get('core_project_num'),
+            (proj.get('project_title') or '')[:512],
+            (org or '')[:255],
+            proj.get('fiscal_year'),
+            proj.get('activity_code'),
+            proj.get('project_start_date'),
+            proj.get('project_end_date'),
+            proj.get('abstract_text'),
+        ))
+    logger.info('Fetched %d RePORTER projects', len(project_rows))
+    reload_table(
+        conn,
+        'grant_reporter_project',
+        project_rows,
+        ['appl_id', 'core_project_num', 'project_title', 'org_name',
+         'fiscal_year', 'activity_code', 'project_start_date',
+         'project_end_date', 'abstract_text'],
+    )
+
+    # ----- Loop B: publications -----
+    link_rows = []
+    seen_pairs = set()
+    for pmid, appl_id, core in fetch_publications_for_appl_ids(appl_ids):
+        key = (pmid, appl_id)
+        if key in seen_pairs:
+            continue
+        seen_pairs.add(key)
+        link_rows.append((pmid, appl_id, core))
+    logger.info('Fetched %d unique (pmid, appl_id) pairs', len(link_rows))
+    reload_table(
+        conn,
+        'grant_reporter_link',
+        link_rows,
+        ['pmid', 'appl_id', 'core_project_num'],
+    )
+
+    # ----- Reconciliation -----
+    reconcile_provenance(conn)
+
+    conn.close()
+    logger.info('RePORTER ETL complete')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/update/run_all.py b/update/run_all.py
index e53e2fb..6ebe245 100644
--- a/update/run_all.py
+++ b/update/run_all.py
@@ -112,7 +112,8 @@ def main():
     scripts = [
         ("executeFeatureGenerator", "python3 executeFeatureGenerator.py"),
         ("retrieveArticles", "python3 retrieveArticles.py"),
-        ("retrieveNIH", "python3 retrieveNIH.py"),  
+        ("retrieveNIH", "python3 retrieveNIH.py"),
+        ("retrieveReporter", "python3 retrieveReporter.py"),
         ("nightlyIndexing", "bash run_nightly_indexing.sh"),
         ("abstractImport", "python3 abstractImport.py"),
         ("conflictsImport", "python3 conflictsImport.py")

From 3d88b9221c00a95033619a2c43d11211058776c6 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Mon, 18 May 2026 00:17:11 -0400
Subject: [PATCH 10/19] feat(reporter): capture NIH RePORTER project terms
 (#291)

RePORTER /projects/search returns NIH-curated keyword vocabularies
alongside the abstract. Capture them into grant_reporter_project so the
Scholars-Profile-System funding ETL can project them onto grants as a
topical search signal (issue #291).

- alter_add_reporter_terms_v1.3.sql: ADD COLUMN project_terms, pref_terms
  via the information_schema-guarded idiom, safe on the live table.
- v1.2 CREATE TABLE: mirror the two columns for fresh builds.
- retrieveReporter.py: pull `terms` and `pref_terms` from each project
  dict, stored raw (terms angle-bracket-wrapped, pref_terms
  semicolon-delimited).
---
 setup/alter_add_reporter_fields_v1.2.sql |  7 +++
 setup/alter_add_reporter_terms_v1.3.sql  | 59 ++++++++++++++++++++++++
 update/retrieveReporter.py               |  7 ++-
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 setup/alter_add_reporter_terms_v1.3.sql

diff --git a/setup/alter_add_reporter_fields_v1.2.sql b/setup/alter_add_reporter_fields_v1.2.sql
index 1520eb5..39c36bb 100644
--- a/setup/alter_add_reporter_fields_v1.2.sql
+++ b/setup/alter_add_reporter_fields_v1.2.sql
@@ -33,6 +33,11 @@
 -- abstracts from Postgres (Scholars-Profile-System) where they're joined to
 -- InfoEd grant rows; this column exists for ad-hoc analysis and future
 -- reciterdb-side consumers.
+--
+-- project_terms / pref_terms hold the NIH-curated keyword vocabulary RePORTER
+-- returns per project, stored raw (project_terms angle-bracket-wrapped,
+-- pref_terms semicolon-delimited). Added by alter_add_reporter_terms_v1.3.sql;
+-- mirrored into the CREATE TABLE here so a fresh build matches (issue #291).
 -- -----------------------------------------------------------------------------
 
 CREATE TABLE IF NOT EXISTS `grant_reporter_project` (
@@ -45,6 +50,8 @@ CREATE TABLE IF NOT EXISTS `grant_reporter_project` (
   `project_start_date` date DEFAULT NULL,
   `project_end_date` date DEFAULT NULL,
   `abstract_text` mediumtext DEFAULT NULL,
+  `project_terms` text DEFAULT NULL,
+  `pref_terms` text DEFAULT NULL,
   `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP,
   PRIMARY KEY (`appl_id`),
   KEY `core_project_num` (`core_project_num`)
diff --git a/setup/alter_add_reporter_terms_v1.3.sql b/setup/alter_add_reporter_terms_v1.3.sql
new file mode 100644
index 0000000..0b4d634
--- /dev/null
+++ b/setup/alter_add_reporter_terms_v1.3.sql
@@ -0,0 +1,59 @@
+-- =============================================================================
+-- Migration: NIH RePORTER project terms (v1.3)
+-- =============================================================================
+-- Adds two columns to grant_reporter_project for the NIH-curated keyword
+-- vocabulary RePORTER returns alongside the abstract:
+--   - project_terms — RePORTER `terms`, angle-bracket-wrapped (<a><b><c>)
+--   - pref_terms    — RePORTER `pref_terms`, semicolon-delimited (a;b;c)
+--
+-- Stored raw, verbatim from the API. Parsing into a keyword array happens
+-- downstream in the Scholars-Profile-System ETL (issue #291); reciterdb keeps
+-- the unparsed strings so a future reciterdb-side consumer can re-parse.
+--
+-- WHY AN ALTER, NOT THE CREATE TABLE in v1.2:
+--   alter_add_reporter_fields_v1.2.sql creates grant_reporter_project with
+--   CREATE TABLE IF NOT EXISTS — a no-op once the table exists, so editing its
+--   body would not add columns to a live table. This file uses the
+--   information_schema-guarded ALTER idiom (cf. v1.1) so it is safe on a
+--   populated prod/dev table. The two columns were also added to v1.2's
+--   CREATE TABLE so a fresh build matches.
+--
+-- Safe to run on prod and dev. Idempotent (information_schema guard; no-op on
+-- re-run). No AFTER clause — keeps ALGORITHM=INSTANT eligible.
+--
+-- Run BEFORE deploying the updated retrieveReporter.py, otherwise the project
+-- INSERT will fail with "Unknown column" on the 2 new fields.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- grant_reporter_project: + project_terms + pref_terms
+-- -----------------------------------------------------------------------------
+
+SET @db = DATABASE();
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'grant_reporter_project'
+       AND column_name = 'project_terms') = 0,
+    'ALTER TABLE grant_reporter_project ADD COLUMN `project_terms` text DEFAULT NULL',
+    'SELECT ''grant_reporter_project.project_terms already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'grant_reporter_project'
+       AND column_name = 'pref_terms') = 0,
+    'ALTER TABLE grant_reporter_project ADD COLUMN `pref_terms` text DEFAULT NULL',
+    'SELECT ''grant_reporter_project.pref_terms already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+SELECT table_name, column_name, data_type, is_nullable
+FROM information_schema.columns
+WHERE table_schema = DATABASE()
+  AND table_name = 'grant_reporter_project'
+  AND column_name IN ('project_terms', 'pref_terms')
+ORDER BY ordinal_position;
diff --git a/update/retrieveReporter.py b/update/retrieveReporter.py
index 7d938d7..2088dc6 100644
--- a/update/retrieveReporter.py
+++ b/update/retrieveReporter.py
@@ -390,6 +390,11 @@ def main():
             proj.get('project_start_date'),
             proj.get('project_end_date'),
             proj.get('abstract_text'),
+            # NIH-curated keyword vocabularies, stored raw (issue #291).
+            # 'terms' is angle-bracket-wrapped (<a><b><c>); 'pref_terms' is
+            # semicolon-delimited. Parsed downstream by the SPS funding ETL.
+            proj.get('terms'),
+            proj.get('pref_terms'),
         ))
     logger.info('Fetched %d RePORTER projects', len(project_rows))
     reload_table(
@@ -398,7 +403,7 @@ def main():
         project_rows,
         ['appl_id', 'core_project_num', 'project_title', 'org_name',
          'fiscal_year', 'activity_code', 'project_start_date',
-         'project_end_date', 'abstract_text'],
+         'project_end_date', 'abstract_text', 'project_terms', 'pref_terms'],
     )
 
     # ----- Loop B: publications -----

From 001c1bd6677ecbb3545e9d49769d83d0882a5606 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Mon, 18 May 2026 08:53:36 -0400
Subject: [PATCH 11/19] fix(reporter): dedup projects by appl_id before
 grant_reporter_project reload

fetch_projects() partitions the search by fiscal year when the corpus
exceeds RePORTER's 9,999 offset cap (WCM has ~15K projects). RePORTER
returns a multi-year project under every fiscal year it was active, so
the same appl_id comes back in multiple FY pages. appl_id is
grant_reporter_project's PRIMARY KEY, so the unguarded reload hit
"IntegrityError 1062: Duplicate entry for key 'PRIMARY'" and the run
aborted after TRUNCATE had already emptied the table.

Dedup the projects loop by appl_id, mirroring the seen_pairs guard the
publications loop already uses.
---
 update/retrieveReporter.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/update/retrieveReporter.py b/update/retrieveReporter.py
index 2088dc6..36c8e88 100644
--- a/update/retrieveReporter.py
+++ b/update/retrieveReporter.py
@@ -374,10 +374,19 @@ def main():
     # fields back than maintain two name conventions.
     project_rows = []
     appl_ids = []
+    seen_appl_ids = set()
     for proj in fetch_projects(base_criteria={'org_names': [org_name]}):
         appl_id = proj.get('appl_id')
         if not appl_id:
             continue
+        # RePORTER returns a project under every fiscal year it was active, so
+        # the FY-partitioned fetch (used when the corpus exceeds the 9,999
+        # offset cap) yields the same appl_id in multiple pages. appl_id is
+        # grant_reporter_project's PRIMARY KEY, so dedup before the reload —
+        # mirrors the seen_pairs guard in the publications loop below.
+        if appl_id in seen_appl_ids:
+            continue
+        seen_appl_ids.add(appl_id)
         appl_ids.append(appl_id)
         org = (proj.get('organization') or {}).get('org_name')
         project_rows.append((

From 2ada77f963ec633006b1720bf808228fecd77abf Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Tue, 19 May 2026 07:44:06 -0400
Subject: [PATCH 12/19] fix(docker): add missing COPY for retrieveReporter.py

run_all.py invokes retrieveReporter.py, but the Dockerfile's per-file
COPY list was never updated when the RePORTER ETL step was added (PR #81),
so the built image lacked the script. Every nightly run crashed at step 4
with "can't open file '/usr/src/app/retrieveReporter.py'", halting the
pipeline before nightly indexing, abstractImport, and conflictsImport.
With restartPolicy: OnFailure this produced an indefinite ~95-min restart loop.
---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index 8501d0b..f4f344b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,6 +13,7 @@ ENV PYTHONUNBUFFERED=1
 
 # Copy additional Python scripts
 COPY update/retrieveNIH.py ./
+COPY update/retrieveReporter.py ./
 COPY update/retrieveAltmetric.py ./
 COPY update/retrieveArticles.py ./
 COPY update/updateReciterDB.py ./

From 87d9443c0a9bedd4fc0b3e038ef4726a69b3091b Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Wed, 20 May 2026 17:34:34 -0400
Subject: [PATCH 13/19] fix(reporting_abstracts): repair cross-paper
 concatenation from old CSV/LOAD DATA path

The pre-PR-#78 abstractImport.py wrote rows with csv.writer (RFC-4180
doubled-quote escaping) and LOAD DATA LOCAL INFILE (backslash escaping).
On rows whose abstract text contained ", MySQL's parser desynced and
consumed multiple TSV rows into one field until the next recoverable
delimiter, producing a single reporting_abstracts row attributed to one
pmid but containing concatenated text from several other papers in the
same DynamoDB batch.

The May 16 fix (PR #78) stopped new corruption but did not repair
existing rows. fetch_missing_pmids() uses LEFT JOIN ... WHERE a.pmid IS
NULL, so any pmid with a (corrupted) row was permanently skipped.

Audit against prod found 3,124 corrupted rows out of 391,238 total.
84% are pegged at the 65,535-byte BLOB cap with the original pmid's
abstract at the head and text from several unrelated papers at the tail.
181 additional rows are clean-pair duplicates from the same era (no
UNIQUE constraint on pmid; concurrent old-import runs produced identical
content twice). Verification against DynamoDB also spared 464 legitimately
long abstracts (structured / consensus / multi-arm trials, lengths up to
~32K) that a length-only filter would have wrongly purged.

This change ships the tooling and schema for the cleanup:

- update/auditAbstracts.py -- read-only forensic audit. Compares
  reporting_abstracts.abstract against DynamoDB.PubMedArticle (the same
  source abstractImport.py uses) and classifies each row >= 4000 chars
  as CLEAN, PREFIX_CORRUPTED, DISJOINT, EMPTY_IN_DYNAMO, or
  MISSING_IN_DYNAMO. Writes audit_abstracts.csv plus a dump of the worst
  offenders.

- update/repairAbstracts.py -- destructive cleanup. Backs up affected
  rows to a timestamped table, deletes corrupted rows in batches, then
  dedupes any remaining pmid duplicates by keeping MIN(id). Requires
  --apply; default is dry-run. Confirms the v1.4 migration precondition
  (no duplicates) after running.

- setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql -- adds UNIQUE
  KEY on reporting_abstracts.pmid. information_schema-guarded; aborts
  if any duplicates remain. Mirrors the analysis_nih fix from PR #71
  after the Dec 2025 duplicate-loading incident.

- setup/createDatabaseTableReciterDb.sql -- idx_pmid is now UNIQUE for
  fresh installs.

- .gitignore -- excludes audit / repair artifacts at the repo root
  (they contain prod abstract text).

DBA runbook after merge:
  1. python3 update/auditAbstracts.py
  2. python3 update/repairAbstracts.py            (dry-run; review counts)
  3. python3 update/repairAbstracts.py --apply
  4. mysql ... < setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql
  5. Next nightly abstractImport.py backfills the deleted PMIDs via the
     parameterized path.

Refs: #87, PR #78.
---
 .gitignore                                    |   8 +
 ...r_add_uq_pmid_reporting_abstracts_v1.4.sql |  84 +++++
 setup/createDatabaseTableReciterDb.sql        |   2 +-
 update/auditAbstracts.py                      | 318 ++++++++++++++++
 update/repairAbstracts.py                     | 339 ++++++++++++++++++
 5 files changed, 750 insertions(+), 1 deletion(-)
 create mode 100644 setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql
 create mode 100644 update/auditAbstracts.py
 create mode 100644 update/repairAbstracts.py

diff --git a/.gitignore b/.gitignore
index 0f6ba4c..b2dd5b2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,14 @@ update/*.log
 update/app.log
 update/retrieveNIH.log
 update/temp/
+retrieveNIH.log
+
+# One-shot audit / repair artifacts (contain prod abstract text; never commit)
+audit_abstracts.csv
+audit_abstracts_dump.txt
+invalid_pmids.txt
+invalid_pmids.sql
+reporting_abstracts_corrupt_backup_*.sql
 
 # Legacy ML models (unused)
 update/*.keras
diff --git a/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql
new file mode 100644
index 0000000..a6e3211
--- /dev/null
+++ b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql
@@ -0,0 +1,84 @@
+-- =============================================================================
+-- Migration: UNIQUE KEY on reporting_abstracts.pmid (v1.4)
+-- =============================================================================
+-- Replaces the existing non-unique `idx_pmid` index on reporting_abstracts
+-- with a UNIQUE KEY so the parser-desync class of failure that corrupted
+-- ~3,100 rows historically (issue #87, pre-PR #78 CSV / LOAD DATA path) can
+-- no longer silently produce duplicate-pmid rows.
+--
+-- WHY THIS IS NEEDED:
+--   update/abstractImport.py's fetch_missing_pmids() uses
+--     LEFT JOIN reporting_abstracts a ON a.pmid = p.pmid WHERE a.pmid IS NULL
+--   so the import path *assumes* one-row-per-pmid. The schema never
+--   enforced it. This migration codifies the assumption, mirroring the
+--   analysis_nih fix from March (PR #71/#72 after the Dec 2025 duplicate
+--   loading incident).
+--
+-- PRECONDITION:
+--   reporting_abstracts must contain zero duplicate pmids. The
+--   information_schema-guarded block at the top aborts the migration with a
+--   readable error if duplicates remain (run update/repairAbstracts.py
+--   first; it warns when duplicates are present).
+--
+-- Safe to run on prod and dev. Idempotent (information_schema guard;
+-- re-runs are no-ops once the UNIQUE KEY exists). No AFTER clause; one
+-- ALTER drops the non-unique idx_pmid and re-adds it as UNIQUE BTREE.
+-- =============================================================================
+
+SET @db = DATABASE();
+
+-- -----------------------------------------------------------------------------
+-- Precondition: no duplicate pmids.
+-- -----------------------------------------------------------------------------
+
+SET @dup_count = (
+    SELECT COUNT(*) FROM (
+        SELECT pmid FROM reporting_abstracts
+        GROUP BY pmid HAVING COUNT(*) > 1
+    ) d
+);
+
+SET @sql = IF(
+    @dup_count > 0,
+    CONCAT(
+        'SELECT ',
+        '''Migration aborted: reporting_abstracts has ',
+        @dup_count,
+        ' duplicate pmid value(s). Run update/repairAbstracts.py and resolve ',
+        'duplicates before re-running this migration.'' AS error, ',
+        '1/0 AS force_error'
+    ),
+    'SELECT ''No duplicate pmids; precondition satisfied.'' AS status'
+);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- reporting_abstracts.idx_pmid: KEY -> UNIQUE KEY
+-- -----------------------------------------------------------------------------
+
+SET @already_unique = (
+    SELECT COUNT(*) FROM information_schema.statistics
+    WHERE table_schema = @db
+      AND table_name = 'reporting_abstracts'
+      AND index_name = 'idx_pmid'
+      AND non_unique = 0
+);
+
+SET @sql = IF(
+    @already_unique > 0,
+    'SELECT ''reporting_abstracts.idx_pmid is already UNIQUE; no-op.''',
+    'ALTER TABLE reporting_abstracts
+       DROP INDEX idx_pmid,
+       ADD UNIQUE KEY idx_pmid (pmid) USING BTREE'
+);
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+SELECT table_name, index_name, non_unique, column_name, index_type
+FROM information_schema.statistics
+WHERE table_schema = DATABASE()
+  AND table_name = 'reporting_abstracts'
+  AND index_name = 'idx_pmid';
diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql
index cdedbd7..91c8641 100644
--- a/setup/createDatabaseTableReciterDb.sql
+++ b/setup/createDatabaseTableReciterDb.sql
@@ -801,7 +801,7 @@ CREATE TABLE IF NOT EXISTS `reporting_abstracts` (
   `abstract` blob DEFAULT NULL,
   `abstractVarchar` varchar(15000) DEFAULT NULL,
   PRIMARY KEY (`id`),
-  KEY `idx_pmid` (`pmid`) USING BTREE
+  UNIQUE KEY `idx_pmid` (`pmid`) USING BTREE
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
 
 CREATE TABLE IF NOT EXISTS `reporting_ad_hoc_feature_generator_execution` (
diff --git a/update/auditAbstracts.py b/update/auditAbstracts.py
new file mode 100644
index 0000000..b56dbcb
--- /dev/null
+++ b/update/auditAbstracts.py
@@ -0,0 +1,318 @@
+"""
+auditAbstracts.py -- one-shot forensic audit of reporting_abstracts.
+
+Pulls rows where LENGTH(abstract) >= AUDIT_LENGTH_THRESHOLD, fetches the
+DynamoDB ground truth for each PMID via the same path abstractImport.py
+uses, and classifies each row:
+
+  CLEAN              DB matches Dynamo (long but legitimate abstract).
+  PREFIX_CORRUPTED   First ~150 chars of the Dynamo abstract appear near
+                     the start of the DB blob and DB is substantially
+                     longer than Dynamo -- the cross-paper concatenation
+                     pattern produced by the old CSV / LOAD DATA path.
+  DISJOINT           DB front does not match Dynamo front; needs manual
+                     review.
+  MISSING_IN_DYNAMO  DynamoDB has no PubMedArticle record for the PMID.
+  EMPTY_IN_DYNAMO    Record present but yields empty abstract.
+
+Outputs:
+  - audit_abstracts.csv     one row per PMID examined
+  - audit_abstracts_dump.txt full text dump of the top N corrupted rows
+  - per-verdict counters and worst-offender summary to stdout
+
+Read-only. Does not modify reporting_abstracts.
+
+Env:
+  DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME
+  AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION
+  AUDIT_LENGTH_THRESHOLD (default 4000)
+  AUDIT_MAX_CANDIDATES   (default 1000)
+"""
+
+import concurrent.futures
+import csv
+import logging
+import os
+import sys
+import time
+
+import boto3
+import pymysql.cursors
+import pymysql.err
+
+
+DB_USERNAME = os.getenv("DB_USERNAME")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+DB_HOST = os.getenv("DB_HOST")
+DB_NAME = os.getenv("DB_NAME")
+
+LENGTH_THRESHOLD = int(os.getenv("AUDIT_LENGTH_THRESHOLD", "4000"))
+MAX_CANDIDATES = int(os.getenv("AUDIT_MAX_CANDIDATES", "1000"))
+
+CHUNK_SIZE = 100
+MAX_WORKERS = 5
+MAX_UNPROCESSED_RETRIES = 8
+
+OUTPUT_CSV = "audit_abstracts.csv"
+DUMP_FILE = "audit_abstracts_dump.txt"
+DUMP_TOP_N = 5
+
+# Compare on the first HEAD_SAMPLE chars of the Dynamo abstract; require
+# it to be found within the first HEAD_SEARCH_WINDOW chars of the DB blob.
+# Short enough to tolerate leading-character noise (the orphan `"` and
+# similar CSV artifacts), long enough to be specific.
+HEAD_SAMPLE = 150
+HEAD_SEARCH_WINDOW = 400
+# A DB blob this much longer than Dynamo is the concatenation signal.
+LENGTH_INFLATION_RATIO = 1.3
+# BLOB-cap rule: a row right at the column cap with a Dynamo abstract many
+# times smaller is the parser-desync fingerprint regardless of whether the
+# first 150 chars happen to match (PubMed sometimes updated section labels
+# between the original CSV load and now, which can defeat the head-string
+# match).
+BLOB_CAP_THRESHOLD = 60000
+BLOB_CAP_INFLATION_RATIO = 5
+
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+logging.getLogger("botocore").setLevel(logging.WARNING)
+logging.getLogger("boto3").setLevel(logging.WARNING)
+
+
+def connect_mysql():
+    try:
+        return pymysql.connect(
+            user=DB_USERNAME,
+            password=DB_PASSWORD,
+            database=DB_NAME,
+            host=DB_HOST,
+            autocommit=True,
+            charset="utf8mb4",
+            cursorclass=pymysql.cursors.DictCursor,
+        )
+    except pymysql.err.MySQLError as err:
+        logger.error(f"DB connection failed: {err}")
+        sys.exit(1)
+
+
+def fetch_candidates(conn, threshold, max_rows):
+    sql = """
+        SELECT pmid, LENGTH(abstract) AS db_len, abstract
+        FROM reporting_abstracts
+        WHERE LENGTH(abstract) >= %s
+        ORDER BY LENGTH(abstract) DESC
+        LIMIT %s
+    """
+    with conn.cursor() as cur:
+        cur.execute(sql, (threshold, max_rows))
+        rows = cur.fetchall()
+    for r in rows:
+        if isinstance(r["abstract"], (bytes, bytearray)):
+            r["abstract"] = r["abstract"].decode("utf-8", errors="replace")
+        r["abstract"] = r["abstract"].replace("\r\n", "\n")
+    return rows
+
+
+def get_abstract(item):
+    """Same extraction logic as update/abstractImport.py:99."""
+    medline_citation = item.get("pubmedarticle", {}).get("medlinecitation")
+    if not medline_citation:
+        return ""
+    article = medline_citation.get("article")
+    if not article:
+        return ""
+    publication_abstract = article.get("publicationAbstract")
+    if not publication_abstract:
+        return ""
+    abstract_texts = []
+    for abstract_part in publication_abstract.get("abstractTexts", []):
+        label = abstract_part.get("abstractTextLabel")
+        text = abstract_part.get("abstractText")
+        if text:
+            label_text = f"{label}: " if label else ""
+            abstract_texts.append(label_text + text)
+    return " ".join(abstract_texts) if abstract_texts else ""
+
+
+def fetch_abstracts_from_dynamo(pmids):
+    """Return ({pmid: abstract}, found_pmids) from DynamoDB PubMedArticle.
+
+    Input pmids are de-duplicated first: BatchGetItem rejects a key list
+    containing duplicates, and duplicate-pmid rows can exist in the DB."""
+    client = boto3.resource("dynamodb").meta.client
+    unique = list(dict.fromkeys(pmids))  # order-preserving dedupe
+
+    def fetch_chunk(chunk):
+        request_keys = [{"pmid": p} for p in chunk]
+        results, present, attempt = {}, set(), 0
+        while request_keys:
+            response = client.batch_get_item(
+                RequestItems={"PubMedArticle": {"Keys": request_keys}}
+            )
+            for item in response["Responses"].get("PubMedArticle", []):
+                pmid = item.get("pmid")
+                if pmid is not None:
+                    present.add(pmid)
+                    results[pmid] = get_abstract(item)
+            # Keys DynamoDB did not process are retried with capped backoff.
+            unprocessed = response.get("UnprocessedKeys", {}).get("PubMedArticle", {})
+            request_keys = unprocessed.get("Keys", [])
+            if request_keys:
+                attempt += 1
+                if attempt > MAX_UNPROCESSED_RETRIES:
+                    logger.warning(
+                        f"{len(request_keys)} keys still unprocessed after "
+                        f"{MAX_UNPROCESSED_RETRIES} retries; skipping remainder."
+                    )
+                    break
+                time.sleep(min(0.1 * (2 ** attempt), 5.0))
+        return results, present
+
+    chunks = [unique[i:i + CHUNK_SIZE] for i in range(0, len(unique), CHUNK_SIZE)]
+    all_results, found = {}, set()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
+        futures = [ex.submit(fetch_chunk, c) for c in chunks]
+        for f in concurrent.futures.as_completed(futures):
+            res, present = f.result()
+            all_results.update(res)
+            found.update(present)
+    return all_results, found
+
+
+def classify(db_abs, dyn_abs, dyn_present):
+    """Return the audit verdict for one row (DB text vs. Dynamo text)."""
+    if not dyn_present:
+        return "MISSING_IN_DYNAMO"
+    if not dyn_abs:
+        return "EMPTY_IN_DYNAMO"
+
+    db_norm, dyn_norm = db_abs.strip(), dyn_abs.strip()
+    if db_norm == dyn_norm:
+        return "CLEAN"
+    db_len, dyn_len = len(db_norm), len(dyn_norm)
+
+    # Allow tiny tail differences (trailing whitespace/punctuation, an extra
+    # character or two) and a leading orphan `"`. Quotes are stripped before
+    # slicing so prefixes stay aligned; the compared prefix is never empty.
+    db_core, dyn_core = db_norm.lstrip('"'), dyn_norm.lstrip('"')
+    shared = min(len(db_core), len(dyn_core))
+    keep = shared - 5 if shared > 5 else shared
+    if abs(db_len - dyn_len) <= 5 and keep and db_core[:keep] == dyn_core[:keep]:
+        return "CLEAN"
+
+    head_sample = dyn_norm[:HEAD_SAMPLE]
+    if head_sample and head_sample in db_norm[:HEAD_SEARCH_WINDOW]:
+        if db_len > dyn_len * LENGTH_INFLATION_RATIO:
+            return "PREFIX_CORRUPTED"
+        return "CLEAN"
+    if db_len >= BLOB_CAP_THRESHOLD and db_len > dyn_len * BLOB_CAP_INFLATION_RATIO:
+        return "PREFIX_CORRUPTED"
+    return "DISJOINT"
+
+
+def safe_oneline(s, n):
+    return s[:n].replace("\n", " ").replace("\t", " ").replace("\r", " ")
+
+
+def main():
+    logger.info(
+        f"Audit: LENGTH(abstract) >= {LENGTH_THRESHOLD}; "
+        f"max candidates: {MAX_CANDIDATES}"
+    )
+    conn = connect_mysql()
+    try:
+        candidates = fetch_candidates(conn, LENGTH_THRESHOLD, MAX_CANDIDATES)
+    finally:
+        conn.close()
+
+    logger.info(f"Candidates from reporting_abstracts: {len(candidates)}")
+    if not candidates:
+        logger.info("Nothing above threshold; exiting.")
+        return
+
+    lens = sorted(c["db_len"] for c in candidates)
+    logger.info(
+        f"DB length distribution: min={lens[0]} "
+        f"p50={lens[len(lens) // 2]} p95={lens[int(len(lens) * 0.95)]} "
+        f"max={lens[-1]}"
+    )
+
+    pmids = [c["pmid"] for c in candidates]
+    dyn_abstracts, dyn_present = fetch_abstracts_from_dynamo(pmids)
+    logger.info(
+        f"DynamoDB returned records for {len(dyn_present)} / {len(pmids)} PMIDs"
+    )
+
+    rows = []
+    counters = {
+        "CLEAN": 0,
+        "PREFIX_CORRUPTED": 0,
+        "DISJOINT": 0,
+        "MISSING_IN_DYNAMO": 0,
+        "EMPTY_IN_DYNAMO": 0,
+    }
+    for c in candidates:
+        pmid = c["pmid"]
+        db_abs = c["abstract"]
+        present = pmid in dyn_present
+        dyn_abs = dyn_abstracts.get(pmid, "")
+        verdict = classify(db_abs, dyn_abs, present)
+        counters[verdict] += 1
+        rows.append({
+            "pmid": pmid,
+            "db_len": c["db_len"],
+            "dyn_len": len(dyn_abs) if present else "",
+            "verdict": verdict,
+            "db_head": safe_oneline(db_abs, 80),
+            "db_tail": safe_oneline(db_abs[-80:] if len(db_abs) >= 80 else db_abs, 80),
+            "dyn_head": safe_oneline(dyn_abs, 80) if present else "",
+        })
+
+    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
+        writer.writeheader()
+        writer.writerows(rows)
+    logger.info(f"Per-row audit written to {OUTPUT_CSV}")
+
+    logger.info("Verdict counts:")
+    for k in ("CLEAN", "PREFIX_CORRUPTED", "DISJOINT",
+              "MISSING_IN_DYNAMO", "EMPTY_IN_DYNAMO"):
+        logger.info(f"  {k:18s} {counters[k]}")
+
+    suspect = [r for r in rows if r["verdict"] in ("PREFIX_CORRUPTED", "DISJOINT")]
+    suspect.sort(key=lambda r: r["db_len"], reverse=True)
+
+    if suspect:
+        with open(DUMP_FILE, "w", encoding="utf-8") as f:
+            for r in suspect[:DUMP_TOP_N]:
+                pmid = r["pmid"]
+                db_abs = next(c["abstract"] for c in candidates if c["pmid"] == pmid)
+                dyn_abs = dyn_abstracts.get(pmid, "")
+                f.write("=" * 80 + "\n")
+                f.write(
+                    f"pmid={pmid} verdict={r['verdict']} "
+                    f"db_len={r['db_len']} dyn_len={r['dyn_len']}\n"
+                )
+                f.write("--- DB (full) ---\n")
+                f.write(db_abs + "\n")
+                f.write("--- Dynamo (full) ---\n")
+                f.write(dyn_abs + "\n\n")
+        logger.info(f"Top {DUMP_TOP_N} suspects dumped to {DUMP_FILE}")
+
+        logger.info(f"Top {min(DUMP_TOP_N, len(suspect))} suspects (summary):")
+        for r in suspect[:DUMP_TOP_N]:
+            logger.info(
+                f"  pmid={r['pmid']:>9} verdict={r['verdict']:17s} "
+                f"db_len={r['db_len']:>6} dyn_len={r['dyn_len']}"
+            )
+            logger.info(f"    db_head  : {r['db_head']!r}")
+            logger.info(f"    db_tail  : {r['db_tail']!r}")
+            logger.info(f"    dyn_head : {r['dyn_head']!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/update/repairAbstracts.py b/update/repairAbstracts.py
new file mode 100644
index 0000000..30f20aa
--- /dev/null
+++ b/update/repairAbstracts.py
@@ -0,0 +1,339 @@
+"""
+repairAbstracts.py -- one-shot cleanup of reporting_abstracts rows flagged
+as corrupted by update/auditAbstracts.py.
+
+Reads audit_abstracts.csv (the audit output) and:
+  1. Backs up the affected rows to reporting_abstracts_corrupt_backup_<ts>.
+  2. Deletes the corrupted rows from reporting_abstracts in batches.
+  3. Dedupes any remaining pmids that have multiple rows by keeping the
+     row with MIN(id) and backing up the rest to the same backup table.
+     (Precondition for the v1.4 UNIQUE KEY migration.)
+  4. Verifies post-state row counts and confirms no duplicate pmids remain.
+
+After this script runs, the next nightly update/abstractImport.py will
+re-fetch the deleted PMIDs cleanly via the parameterized executemany
+path introduced in PR #78.
+
+Destructive. Requires --apply to perform the delete; without --apply it
+runs in dry-run mode (counts only, no writes).
+
+Env:
+  DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME
+"""
+
+import argparse
+import csv
+import datetime
+import logging
+import os
+import re
+import sys
+
+import pymysql.cursors
+import pymysql.err
+
+
+INVALID_VERDICTS = {"PREFIX_CORRUPTED", "DISJOINT", "EMPTY_IN_DYNAMO"}
+DEFAULT_AUDIT_CSV = "audit_abstracts.csv"
+DEFAULT_BATCH_SIZE = 500
+
+# Identifier safety: the backup-table suffix is timestamp-derived, but
+# allow callers to override with --backup-table; whitelist the shape to
+# refuse anything that would require quoting.
+SAFE_IDENT = re.compile(r"^[A-Za-z_][A-Za-z0-9_]{0,63}$")
+
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+
+def connect_mysql():
+    try:
+        return pymysql.connect(
+            user=os.getenv("DB_USERNAME"),
+            password=os.getenv("DB_PASSWORD"),
+            database=os.getenv("DB_NAME"),
+            host=os.getenv("DB_HOST"),
+            autocommit=True,
+            charset="utf8mb4",
+            cursorclass=pymysql.cursors.DictCursor,
+        )
+    except pymysql.err.MySQLError as err:
+        logger.error(f"DB connection failed: {err}")
+        sys.exit(1)
+
+
+def read_invalid_pmids(audit_csv):
+    """Return the sorted, unique pmids whose audit verdict is in INVALID_VERDICTS."""
+    if not os.path.exists(audit_csv):
+        logger.error(f"Audit CSV not found: {audit_csv}")
+        logger.error("Run update/auditAbstracts.py first.")
+        sys.exit(1)
+    # Match the writer in auditAbstracts.py (UTF-8, newline=""); locale-independent.
+    with open(audit_csv, newline="", encoding="utf-8") as f:
+        rows = list(csv.DictReader(f))
+    if not rows or "verdict" not in rows[0] or "pmid" not in rows[0]:
+        logger.error(f"{audit_csv} is missing required columns (pmid, verdict).")
+        sys.exit(1)
+    return sorted({int(r["pmid"]) for r in rows if r["verdict"] in INVALID_VERDICTS})
+
+
+def count_matching(cur, pmids, batch=5000):
+    """COUNT(*) of rows whose pmid is in `pmids`, batched to avoid
+    oversized IN-lists. Returns the sum across batches."""
+    total = 0
+    for i in range(0, len(pmids), batch):
+        chunk = pmids[i:i + batch]
+        placeholders = ",".join(["%s"] * len(chunk))
+        cur.execute(
+            f"SELECT COUNT(*) AS c FROM reporting_abstracts "
+            f"WHERE pmid IN ({placeholders})",
+            chunk,
+        )
+        total += cur.fetchone()["c"]
+    return total
+
+
+def backup_rows(cur, pmids, backup_table, batch):
+    cur.execute(f"CREATE TABLE `{backup_table}` LIKE reporting_abstracts")
+    inserted = 0
+    for i in range(0, len(pmids), batch):
+        chunk = pmids[i:i + batch]
+        placeholders = ",".join(["%s"] * len(chunk))
+        cur.execute(
+            f"INSERT INTO `{backup_table}` "
+            f"SELECT * FROM reporting_abstracts WHERE pmid IN ({placeholders})",
+            chunk,
+        )
+        inserted += cur.rowcount
+        if (i // batch) % 5 == 0:
+            logger.info(f"  ... backed up {inserted:,} rows")
+    return inserted
+
+
+def delete_rows(cur, pmids, batch):
+    deleted = 0
+    for i in range(0, len(pmids), batch):
+        chunk = pmids[i:i + batch]
+        placeholders = ",".join(["%s"] * len(chunk))
+        cur.execute(
+            f"DELETE FROM reporting_abstracts WHERE pmid IN ({placeholders})",
+            chunk,
+        )
+        deleted += cur.rowcount
+        if (i // batch) % 5 == 0:
+            logger.info(f"  ... deleted {deleted:,} rows")
+    return deleted
+
+
+def find_duplicate_pmids(cur, limit=10):
+    cur.execute(
+        "SELECT pmid, COUNT(*) AS c FROM reporting_abstracts "
+        "GROUP BY pmid HAVING c > 1 LIMIT %s",
+        (limit,),
+    )
+    return cur.fetchall()
+
+
+def count_duplicate_extras(cur):
+    """Returns (group_count, extra_row_count). extra_row_count is the number
+    of rows that would need to be deleted to leave one row per pmid."""
+    cur.execute(
+        "SELECT COUNT(*) AS groups, COALESCE(SUM(c - 1), 0) AS extras FROM ("
+        "  SELECT COUNT(*) AS c FROM reporting_abstracts GROUP BY pmid HAVING c > 1"
+        ") d"
+    )
+    r = cur.fetchone()
+    return r["groups"], r["extras"]
+
+
+def backup_duplicate_extras(cur, backup_table):
+    """Insert into the backup table every duplicate row except the MIN(id)
+    keeper for each pmid. Returns the number of rows backed up."""
+    cur.execute(
+        f"INSERT INTO `{backup_table}` "
+        "SELECT ra.* FROM reporting_abstracts ra "
+        "JOIN ("
+        "  SELECT pmid, MIN(id) AS keep_id FROM reporting_abstracts "
+        "  GROUP BY pmid HAVING COUNT(*) > 1"
+        ") k ON k.pmid = ra.pmid AND ra.id <> k.keep_id"
+    )
+    return cur.rowcount
+
+
+def delete_duplicate_extras(cur):
+    """Delete every duplicate row except the MIN(id) keeper for each pmid.
+    Returns the number of rows deleted."""
+    cur.execute(
+        "DELETE ra FROM reporting_abstracts ra "
+        "JOIN ("
+        "  SELECT pmid, MIN(id) AS keep_id FROM reporting_abstracts "
+        "  GROUP BY pmid HAVING COUNT(*) > 1"
+        ") k ON k.pmid = ra.pmid AND ra.id <> k.keep_id"
+    )
+    return cur.rowcount
+
+
+def main():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--audit-csv", default=DEFAULT_AUDIT_CSV,
+                        help=f"Audit CSV path (default {DEFAULT_AUDIT_CSV})")
+    parser.add_argument("--apply", action="store_true",
+                        help="Perform the delete. Without this flag, dry-run only.")
+    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE,
+                        help=f"PMIDs per statement (default {DEFAULT_BATCH_SIZE})")
+    parser.add_argument("--backup-table", default=None,
+                        help="Backup table name (default: reporting_abstracts_corrupt_backup_<ts>)")
+    args = parser.parse_args()
+
+    pmids = read_invalid_pmids(args.audit_csv)
+    logger.info(f"Read {len(pmids):,} invalid PMIDs from {args.audit_csv}")
+    if not pmids:
+        logger.info("Nothing to repair.")
+        return
+
+    backup_table = args.backup_table or (
+        f"reporting_abstracts_corrupt_backup_"
+        f"{datetime.datetime.now():%Y%m%d_%H%M%S}"
+    )
+    if not SAFE_IDENT.match(backup_table):
+        logger.error(f"Refusing unsafe backup-table identifier: {backup_table!r}")
+        sys.exit(1)
+
+    conn = connect_mysql()
+    try:
+        with conn.cursor() as cur:
+            cur.execute("SELECT COUNT(*) AS c FROM reporting_abstracts")
+            before_total = cur.fetchone()["c"]
+
+            matching = count_matching(cur, pmids)
+            logger.info(
+                f"reporting_abstracts: {before_total:,} rows total; "
+                f"{matching:,} rows match the invalid-PMID list."
+            )
+
+            if matching > len(pmids):
+                logger.info(
+                    f"Live matches ({matching:,}) > unique PMIDs ({len(pmids):,}): "
+                    f"{matching - len(pmids):,} of the audited PMIDs have multiple "
+                    "rows in the live table (all of which will be deleted by the IN clause)."
+                )
+            elif matching < len(pmids):
+                logger.warning(
+                    f"Live matches ({matching:,}) < unique PMIDs ({len(pmids):,}): "
+                    f"{len(pmids) - matching:,} audited PMIDs no longer present "
+                    "(already deleted or table changed). Proceeding with what is live."
+                )
+
+            dupe_groups, dupe_extras = count_duplicate_extras(cur)
+            logger.info(
+                f"Duplicate-pmid groups: {dupe_groups:,} "
+                f"({dupe_extras:,} extra rows would be deduped after the corruption delete)."
+            )
+
+            if not args.apply:
+                cur.execute(
+                    "SELECT pmid, LENGTH(abstract) AS db_len FROM reporting_abstracts "
+                    "WHERE LENGTH(abstract) >= 4000 ORDER BY LENGTH(abstract) DESC LIMIT 3"
+                )
+                samples = cur.fetchall()
+                logger.info("Sample of longest current rows (pre-repair):")
+                for s in samples:
+                    logger.info(f"  pmid={s['pmid']:>9} db_len={s['db_len']}")
+                logger.info(f"Would back up to: `{backup_table}`")
+                logger.info(
+                    f"Would delete {matching:,} corrupted rows + dedupe "
+                    f"{dupe_extras:,} duplicate-extras (keep MIN(id) per pmid)."
+                )
+                logger.info("DRY RUN -- no changes made. Re-run with --apply to perform the repair.")
+                return
+
+            logger.info(f"Creating backup table `{backup_table}` ...")
+            backed_up = backup_rows(cur, pmids, backup_table, args.batch_size)
+            logger.info(f"Backed up {backed_up:,} rows to `{backup_table}`.")
+            if backed_up != matching:
+                logger.error(
+                    f"Backup row count {backed_up:,} != expected {matching:,}. Aborting."
+                )
+                sys.exit(1)
+
+            logger.info("Deleting corrupted rows from reporting_abstracts ...")
+            deleted = delete_rows(cur, pmids, args.batch_size)
+
+            cur.execute("SELECT COUNT(*) AS c FROM reporting_abstracts")
+            after_total = cur.fetchone()["c"]
+            logger.info(
+                f"Deleted {deleted:,} rows; reporting_abstracts now has "
+                f"{after_total:,} rows (was {before_total:,})."
+            )
+            if before_total - after_total != deleted:
+                logger.error(
+                    f"Row-count delta mismatch: before-after={before_total - after_total}, "
+                    f"deleted={deleted}. Backup table `{backup_table}` is intact."
+                )
+                sys.exit(1)
+
+            cur.execute(
+                "SELECT COUNT(*) AS c FROM reporting_abstracts WHERE LENGTH(abstract) >= 4000"
+            )
+            long_remaining = cur.fetchone()["c"]
+            logger.info(
+                f"Rows with LENGTH(abstract) >= 4000 remaining: {long_remaining:,} "
+                "(should approximately equal the CLEAN count from the audit)."
+            )
+
+            cur.execute(
+                "SELECT COUNT(*) AS c FROM reporting_abstracts WHERE LENGTH(abstract) >= 60000"
+            )
+            cap_remaining = cur.fetchone()["c"]
+            logger.info(
+                f"Rows at/above 60K (BLOB-cap region) remaining: {cap_remaining:,} "
+                "(should be 0 if repair caught all corruption)."
+            )
+
+            dupe_groups_after, dupe_extras_after = count_duplicate_extras(cur)
+            if dupe_extras_after > 0:
+                logger.info(
+                    f"Phase 2: deduping {dupe_extras_after:,} extra rows across "
+                    f"{dupe_groups_after:,} pmid groups (keeping MIN(id) per pmid)..."
+                )
+                backed_up_dupes = backup_duplicate_extras(cur, backup_table)
+                logger.info(f"  ... backed up {backed_up_dupes:,} duplicate rows to `{backup_table}`.")
+                if backed_up_dupes != dupe_extras_after:
+                    logger.error(
+                        f"Dedup backup count {backed_up_dupes:,} != expected {dupe_extras_after:,}. "
+                        "Aborting before delete."
+                    )
+                    sys.exit(1)
+                deleted_dupes = delete_duplicate_extras(cur)
+                logger.info(f"  ... deleted {deleted_dupes:,} duplicate rows.")
+                if deleted_dupes != dupe_extras_after:
+                    logger.error(
+                        f"Dedup delete count {deleted_dupes:,} != expected {dupe_extras_after:,}. "
+                        f"Backup table `{backup_table}` is intact."
+                    )
+                    sys.exit(1)
+            else:
+                logger.info("Phase 2: no duplicates to dedupe.")
+
+            dupes = find_duplicate_pmids(cur)
+            if dupes:
+                logger.error(
+                    f"{len(dupes)} duplicate pmid(s) still present after dedup (sample): "
+                    f"{[(d['pmid'], d['c']) for d in dupes]}"
+                )
+                sys.exit(1)
+            else:
+                logger.info(
+                    "No duplicate pmids remain. Safe to apply "
+                    "setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql."
+                )
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()

From 8e82e7a0c8b65d0295103a69e0bb2c983502f507 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Wed, 20 May 2026 17:53:55 -0400
Subject: [PATCH 14/19] fix: repair script handles generated columns; migration
 guard actually halts

Two bugs discovered when running the runbook end to end against prod:

1. update/repairAbstracts.py used `CREATE TABLE LIKE` + `INSERT INTO ...
   SELECT *`. Prod added a STORED generated column `abstract_len` (with
   a composite index `idx_abs_pmid_len`) directly without updating the
   repo's schema file. The SELECT * pulled the generated-column value;
   MariaDB's strict mode rejected it with error 1906 and the backup
   aborted with the destination table empty.

   The fix queries information_schema for non-generated columns and
   enumerates them in both the corrupted-rows INSERT and the dedup
   INSERT. The script now works whether or not abstract_len exists.

2. setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql's precondition
   guard used `SELECT 'aborted...' AS error, 1/0 AS force_error`. The
   1/0 only emits a warning in MariaDB's default SQL mode -- it does
   NOT halt the script. So when the DBA uploaded the migration before
   running the repair, the precondition SELECT was printed but
   execution continued to the ALTER, which then failed with
   `Duplicate entry '9182809' for key 'idx_pmid'`.

   The fix synthesizes a SELECT against a non-existent table whose
   name encodes the duplicate count
   (`__migration_aborted_reporting_abstracts_has_N_duplicate_pmids__
   _run_update_repairAbstracts_py_first`). The resulting "Table
   doesn't exist" error halts the SQL client immediately, and the
   error text itself tells the operator what to do.

Schema drift (the live abstract_len + idx_abs_pmid_len that aren't in
createDatabaseTableReciterDb.sql) is a separate concern not addressed
here.
---
 ...r_add_uq_pmid_reporting_abstracts_v1.4.sql | 14 ++++++----
 update/repairAbstracts.py                     | 27 ++++++++++++++++---
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql
index a6e3211..abff544 100644
--- a/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql
+++ b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql
@@ -29,6 +29,13 @@ SET @db = DATABASE();
 
 -- -----------------------------------------------------------------------------
 -- Precondition: no duplicate pmids.
+--
+-- If duplicates exist, the precondition synthesizes a SELECT against a
+-- non-existent table whose name encodes the duplicate count. The resulting
+-- "Table doesn't exist" error halts execution (SELECT-with-1/0 only emits
+-- a warning, which MariaDB ignored outside a stored program in v1 of this
+-- migration -- the cleanup was attempted, ALTER ran anyway, ALTER failed
+-- on the first duplicate pmid).
 -- -----------------------------------------------------------------------------
 
 SET @dup_count = (
@@ -41,12 +48,9 @@ SET @dup_count = (
 SET @sql = IF(
     @dup_count > 0,
     CONCAT(
-        'SELECT ',
-        '''Migration aborted: reporting_abstracts has ',
+        'SELECT 1 FROM `__migration_aborted_reporting_abstracts_has_',
         @dup_count,
-        ' duplicate pmid value(s). Run update/repairAbstracts.py and resolve ',
-        'duplicates before re-running this migration.'' AS error, ',
-        '1/0 AS force_error'
+        '_duplicate_pmids__run_update_repairAbstracts_py_first`'
     ),
     'SELECT ''No duplicate pmids; precondition satisfied.'' AS status'
 );
diff --git a/update/repairAbstracts.py b/update/repairAbstracts.py
index 30f20aa..3f9466d 100644
--- a/update/repairAbstracts.py
+++ b/update/repairAbstracts.py
@@ -97,15 +97,31 @@ def count_matching(cur, pmids, batch=5000):
     return total
 
 
+def writable_columns(cur, table="reporting_abstracts"):
+    """Return the non-generated (INSERT-able) columns of `table` in ordinal order;
+    prod's STORED generated column abstract_len cannot be assigned. The explicit
+    alias pins the result key, which MySQL 8 would otherwise return uppercased."""
+    cur.execute(
+        "SELECT column_name AS column_name FROM information_schema.columns "
+        "WHERE table_schema = DATABASE() AND table_name = %s "
+        "  AND (extra IS NULL OR extra NOT LIKE '%%GENERATED%%') "
+        "ORDER BY ordinal_position",
+        (table,),
+    )
+    return [r["column_name"] for r in cur.fetchall()]
+
+
 def backup_rows(cur, pmids, backup_table, batch):
     cur.execute(f"CREATE TABLE `{backup_table}` LIKE reporting_abstracts")
+    cols = writable_columns(cur)
+    col_list = ", ".join(f"`{c}`" for c in cols)
     inserted = 0
     for i in range(0, len(pmids), batch):
         chunk = pmids[i:i + batch]
         placeholders = ",".join(["%s"] * len(chunk))
         cur.execute(
-            f"INSERT INTO `{backup_table}` "
-            f"SELECT * FROM reporting_abstracts WHERE pmid IN ({placeholders})",
+            f"INSERT INTO `{backup_table}` ({col_list}) "
+            f"SELECT {col_list} FROM reporting_abstracts WHERE pmid IN ({placeholders})",
             chunk,
         )
         inserted += cur.rowcount
@@ -153,9 +169,12 @@ def count_duplicate_extras(cur):
 def backup_duplicate_extras(cur, backup_table):
     """Insert into the backup table every duplicate row except the MIN(id)
     keeper for each pmid. Returns the number of rows backed up."""
+    cols = writable_columns(cur)
+    col_list = ", ".join(f"`{c}`" for c in cols)
+    select_list = ", ".join(f"ra.`{c}`" for c in cols)
     cur.execute(
-        f"INSERT INTO `{backup_table}` "
-        "SELECT ra.* FROM reporting_abstracts ra "
+        f"INSERT INTO `{backup_table}` ({col_list}) "
+        f"SELECT {select_list} FROM reporting_abstracts ra "
         "JOIN ("
         "  SELECT pmid, MIN(id) AS keep_id FROM reporting_abstracts "
         "  GROUP BY pmid HAVING COUNT(*) > 1"

From b3eaf6d25c105d62f2c53c95360451b944a8f54e Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Mon, 8 Jun 2026 19:31:22 -0400
Subject: [PATCH 15/19] feat(setup): add durable authorship_review table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Durable curator-state table for the Publication Manager 'Authorships' review queue
(Curator_All). Survives the nightly truncate-reload like grant_provenance — it is in
no truncate list (updateReciterDB.py all_tables) and touched by no stored procedure;
populated externally by the adversarial-attribution-review producer. CREATE TABLE IF
NOT EXISTS, additive, no change to existing ETL.
---
 setup/table_authorship_review.sql | 82 +++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 setup/table_authorship_review.sql

diff --git a/setup/table_authorship_review.sql b/setup/table_authorship_review.sql
new file mode 100644
index 0000000..366986c
--- /dev/null
+++ b/setup/table_authorship_review.sql
@@ -0,0 +1,82 @@
+-- -----------------------------------------------------------------------------
+-- authorship_review — Publication Manager "Authorships" review queue
+-- -----------------------------------------------------------------------------
+-- DURABLE TABLE — survives the nightly truncate-reload. Like `grant_provenance`
+-- (the (person,pmid,grant) audit log) and the `admin_*` tables, this is curator
+-- state, NOT a reporting export. It MUST NOT be added to any truncate list
+-- (see update/updateReciterDB.py `all_tables`) and is not touched by any nightly
+-- stored procedure or ETL step. CREATE TABLE IF NOT EXISTS so re-applying is safe.
+--
+-- One row per WCM-affiliated AUTHORSHIP (a PubMed author carrying a WCM affiliation
+-- on a publication) that is NOT yet assigned to any identity. Powers the Curator_All
+-- `/authorships` tab in ReCiter-Publication-Manager (reads this table via Sequelize).
+--
+-- POPULATED EXTERNALLY (this repo's ETL cannot compute the scores). The producer is
+-- the adversarial-attribution-review pipeline in the ReCiter Research project
+-- (scripts/aar_orchestrator.py -> aar_db.py upsert), which runs the gate (reciterdb
+-- analysis_summary_author = accepted set), the identity matcher (reciterdb identity),
+-- and the pinned XGBoost 3.2.0 models over the S3 scoring inputs to compute the
+-- feedback-identity (FG) and identity-only (IO) scores per authorship. Monthly cron.
+--
+-- Classification per authorship (the producer sets it):
+--   absent     top candidate never scored by production (no person_article row)
+--   suggested  top candidate production final (FG) >= 30 — already in a pending queue
+--   buried     top candidate FG < 30 (IO can be high) — production buried it
+--   assigned   reserved (accepted rows are excluded by the gate, not stored here)
+--
+-- single_candidate = exactly one WCM identity matches the author's surname +
+-- given/initial (cohort_size == 1) — the strongest precision signal; such rows are
+-- near-certain and form the high-precision review lane.
+--
+-- Refresh contract: the producer UPSERTs by author_key, refreshing the scoring/
+-- classification columns and `last_refreshed`; it NEVER overwrites a curator-set
+-- `status` (assigned/accepted/rejected/dismissed/snoozed) or its resolution_cwid/
+-- reviewer/note/snooze_until, and `first_seen` is set once and never overwritten.
+-- -----------------------------------------------------------------------------
+
+CREATE TABLE IF NOT EXISTS `authorship_review` (
+  `id`                     BIGINT       NOT NULL AUTO_INCREMENT,
+  `pmid`                   BIGINT       NOT NULL,
+  `author_key`             VARCHAR(32)  NOT NULL,            -- `pmid:position`
+  `author_position`        INT          NULL,
+  `author_position_label`  VARCHAR(8)   NULL,                -- first/middle/last
+  `wcm_author`             VARCHAR(255) NULL,                -- PubMed author name
+  `author_affiliation`     TEXT         NULL,
+  `entrez_date`            DATE         NULL,                -- ReCiter entrez add date
+  `title`                  TEXT         NULL,
+  `journal`                VARCHAR(512) NULL,
+  `doi`                    VARCHAR(255) NULL,
+  `classification`         ENUM('assigned','suggested','buried','absent') NULL,
+  `top_cwid`               VARCHAR(32)  NULL,                -- proposed identity
+  `top_name`               VARCHAR(255) NULL,
+  `top_person_type`        VARCHAR(64)  NULL,
+  `top_dept`               VARCHAR(255) NULL,
+  `top_fg_score`           FLOAT        NULL,                -- production final (FG)
+  `top_io_score`           FLOAT        NULL,                -- identity-only (IO)
+  `top_confidence`         FLOAT        NULL,
+  `top_cohort_size`        INT          NULL,                -- homonyms (surname+initial)
+  `top_given_match`        VARCHAR(16)  NULL,                -- full|initial
+  `top_affil_match`        TINYINT(1)   NULL,
+  `n_candidates`           INT          NULL,
+  `single_candidate`       TINYINT(1)   NULL,                -- cohort_size == 1
+  `candidate_cwids_json`   LONGTEXT     NULL,                -- ranked alternates
+  `status`                 ENUM('open','assigned','accepted','rejected','dismissed','snoozed')
+                                        NOT NULL DEFAULT 'open',   -- curator state
+  `resolution_cwid`        VARCHAR(32)  NULL,
+  `reviewer`               VARCHAR(64)  NULL,
+  `note`                   TEXT         NULL,
+  `snooze_until`           DATE         NULL,
+  `resolved_at`            DATETIME     NULL,
+  `first_seen`             DATETIME     NULL,                -- set once, never overwritten
+  `last_refreshed`         DATETIME     NULL,
+  `last_checked`           DATETIME     NULL,
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `uq_author_key` (`author_key`),
+  KEY `ix_pmid` (`pmid`),
+  KEY `ix_classification` (`classification`),
+  KEY `ix_status` (`status`),
+  KEY `ix_single_candidate` (`single_candidate`),
+  KEY `ix_top_io_score` (`top_io_score`),
+  KEY `ix_entrez_date` (`entrez_date`),
+  KEY `ix_top_cwid` (`top_cwid`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

From b6f97dcc497906dcf73a68a693a75f9d507c6b5d Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Tue, 9 Jun 2026 07:56:56 -0400
Subject: [PATCH 16/19] feat(setup): add admin_users scope/proxy column
 migration (v1.5)

Adds scope_person_types, scope_org_units, proxy_person_ids (JSON NULL)
to admin_users via an idempotent information_schema-guarded ALTER, for
existing databases that predate the fresh-build schema (#92, master).

The Publication Manager dev-branch AdminUser model (commit 579d32f)
selects these columns during login; without them findOrcreateAdminUser
fails with 'Unknown column' and authentication returns 401 for every
user. Run before deploying the PM dev branch against an existing
reciterdb (e.g. production).

Additive only; admin_users is durable (not in updateReciterDB.py
all_tables truncate list). No-op on re-run.
---
 ...lter_add_admin_user_scope_columns_v1.5.sql | 76 +++++++++++++++++++
 1 file changed, 76 insertions(+)
 create mode 100644 setup/alter_add_admin_user_scope_columns_v1.5.sql

diff --git a/setup/alter_add_admin_user_scope_columns_v1.5.sql b/setup/alter_add_admin_user_scope_columns_v1.5.sql
new file mode 100644
index 0000000..98e4877
--- /dev/null
+++ b/setup/alter_add_admin_user_scope_columns_v1.5.sql
@@ -0,0 +1,76 @@
+-- =============================================================================
+-- Migration: Add admin_users scope/proxy columns (v1.5)
+-- =============================================================================
+-- Adds the three JSON scope columns the Publication Manager AdminUser model
+-- now selects on every login:
+--   - scope_person_types   (JSON, nullable) — person-type curation scope
+--   - scope_org_units       (JSON, nullable) — org-unit curation scope
+--   - proxy_person_ids      (JSON, nullable) — proxied person identifiers
+--
+-- WHY THIS MIGRATION EXISTS:
+--   ReCiter-Publication-Manager (dev branch, model commit 579d32f
+--   "extend AdminUser model with scope/proxy JSON columns") issues
+--     SELECT userID, personIdentifier, ..., scope_person_types,
+--            scope_org_units, proxy_person_ids FROM admin_users
+--   inside findOrcreateAdminUser during authentication. If admin_users is
+--   missing these columns the SELECT fails with ER_BAD_FIELD_ERROR
+--   ("Unknown column 'scope_person_types'"), the authorize() call throws, and
+--   login returns 401 for every user. The columns must exist before the PM
+--   dev branch is deployed against this database.
+--
+--   The fresh-build schema (setup/createDatabaseTableReciterDb.sql on master,
+--   PR #92) already defines admin_users WITH these columns, so new databases
+--   are fine. This migration brings EXISTING databases (e.g. the production
+--   reciterdb, which predates #92 and has none of the three) up to that
+--   schema. There was no ALTER path for existing DBs until now.
+--
+-- DURABILITY: admin_users is curator state, not a reporting export. It is NOT
+--   in update/updateReciterDB.py's truncate list (`all_tables`) and is not
+--   touched by any nightly stored procedure or ETL step, so these columns
+--   persist across nightly reload.
+--
+-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via an
+-- information_schema check (no-op on re-run). Additive only — no existing
+-- column or row is modified. Run BEFORE deploying the PM dev branch.
+-- =============================================================================
+
+-- -----------------------------------------------------------------------------
+-- admin_users: + scope_person_types + scope_org_units + proxy_person_ids
+-- -----------------------------------------------------------------------------
+
+SET @db = DATABASE();
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'admin_users'
+       AND column_name = 'scope_person_types') = 0,
+    'ALTER TABLE admin_users ADD COLUMN `scope_person_types` JSON DEFAULT NULL',
+    'SELECT ''admin_users.scope_person_types already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'admin_users'
+       AND column_name = 'scope_org_units') = 0,
+    'ALTER TABLE admin_users ADD COLUMN `scope_org_units` JSON DEFAULT NULL',
+    'SELECT ''admin_users.scope_org_units already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+SET @sql = (SELECT IF(
+    (SELECT COUNT(*) FROM information_schema.columns
+     WHERE table_schema = @db AND table_name = 'admin_users'
+       AND column_name = 'proxy_person_ids') = 0,
+    'ALTER TABLE admin_users ADD COLUMN `proxy_person_ids` JSON DEFAULT NULL',
+    'SELECT ''admin_users.proxy_person_ids already exists'''));
+PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+SELECT table_name, column_name, data_type, is_nullable
+FROM information_schema.columns
+WHERE table_schema = DATABASE()
+  AND table_name = 'admin_users'
+  AND column_name IN ('scope_person_types', 'scope_org_units', 'proxy_person_ids')
+ORDER BY ordinal_position;

From c856c2c7c027b06c21cf29d33120db82db9a2318 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Sun, 14 Jun 2026 12:40:21 -0400
Subject: [PATCH 17/19] feat(etl): scan ArticleProvenance (DynamoDB) ->
 article_provenance table (#95)

Add a nightly ETL step that scans the ReCiter DynamoDB ArticleProvenance
table and loads first-retrieval provenance into a new reciterdb table,
article_provenance, so Publication Manager can display the date a
publication was first retrieved (PM #737).

- update/retrieveArticleProvenance.py: streams the scan into an
  article_provenance_new staging table (INSERT IGNORE per page to bound
  memory and collapse duplicates), validates row counts against
  production, then atomic RENAME-swaps. Converts frd (epoch seconds, UTC)
  to DATETIME. Mirrors the retrieveNIH staging->swap pattern.
- setup/createDatabaseTableReciterDb.sql: article_provenance DDL for
  fresh installs, keyed (pmid, personIdentifier).
- setup/alter_add_article_provenance_v1.6.sql: migration for existing
  dev/prod databases.
- update/run_all.py: run the step before nightly indexing as non-fatal so
  a failure cannot block the indexing SP.
- Dockerfile: COPY the new script.
- README: document the step and backfill the missing retrieveReporter row.

The table is keyed on (pmid, personIdentifier): the source DynamoDB table
has a composite key uid (HASH) + articleId (RANGE), one item per
person+article, so frd is per-person -- not global-per-article as issue
#95 assumed.
---
 Dockerfile                                  |   1 +
 README.md                                   |  13 +-
 setup/alter_add_article_provenance_v1.6.sql |  56 ++++
 setup/createDatabaseTableReciterDb.sql      |  17 +
 update/retrieveArticleProvenance.py         | 353 ++++++++++++++++++++
 update/run_all.py                           |  24 +-
 6 files changed, 453 insertions(+), 11 deletions(-)
 create mode 100644 setup/alter_add_article_provenance_v1.6.sql
 create mode 100644 update/retrieveArticleProvenance.py

diff --git a/Dockerfile b/Dockerfile
index f4f344b..a388a25 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,6 +14,7 @@ ENV PYTHONUNBUFFERED=1
 # Copy additional Python scripts
 COPY update/retrieveNIH.py ./
 COPY update/retrieveReporter.py ./
+COPY update/retrieveArticleProvenance.py ./
 COPY update/retrieveAltmetric.py ./
 COPY update/retrieveArticles.py ./
 COPY update/updateReciterDB.py ./
diff --git a/README.md b/README.md
index c39a9e1..dc22384 100644
--- a/README.md
+++ b/README.md
@@ -69,12 +69,15 @@ CronJob (daily)
      ├─ 1. executeFeatureGenerator.py    Trigger ReCiter ML scoring API
      ├─ 2. retrieveArticles.py           Pull person/article data from S3 + DynamoDB
      ├─ 3. retrieveNIH.py                NIH iCite API → analysis_nih (atomic swap)
-     ├─ 4. run_nightly_indexing.sh        Run populateAnalysisSummaryTables_v2()
+     ├─ 4. retrieveReporter.py           NIH RePORTER grants → grant_reporter_* (reconcile)
+     ├─ 5. retrieveArticleProvenance.py  ArticleProvenance (DynamoDB) → article_provenance
+     │      └─ non-fatal: a failure here does not block nightly indexing
+     ├─ 6. run_nightly_indexing.sh        Run populateAnalysisSummaryTables_v2()
      │      ├─ Polls analysis_job_log every 3s for progress
      │      ├─ Auto-retries 3x with 60s backoff
      │      └─ Auto-restores from backup on failure
-     ├─ 5. abstractImport.py             PubMed abstracts from DynamoDB
-     └─ 6. conflictsImport.py            COI statements from DynamoDB
+     ├─ 7. abstractImport.py             PubMed abstracts from DynamoDB
+     └─ 8. conflictsImport.py            COI statements from DynamoDB
 ```
 
 **Key patterns:**
@@ -105,6 +108,8 @@ ReCiterDB/
 │   ├── run_nightly_indexing.sh                    # SP runner with monitoring/retry
 │   ├── retrieveArticles.py                        # S3 + DynamoDB article fetcher
 │   ├── retrieveNIH.py                             # NIH iCite fetcher (atomic swap)
+│   ├── retrieveReporter.py                        # NIH RePORTER grants fetcher (reconcile)
+│   ├── retrieveArticleProvenance.py               # ArticleProvenance (DynamoDB) → article_provenance
 │   ├── retrieveAltmetric.py                       # Altmetric API fetcher
 │   ├── updateReciterDB.py                         # Bulk loader (LOAD DATA LOCAL INFILE)
 │   ├── dataTransformer.py                         # ReCiter JSON → CSV
@@ -288,6 +293,8 @@ All defined in `setup/createEventsProceduresReciterDb.sql`.
 | `run_all.py` | EKS orchestrator: runs all pipeline steps in sequence with timeout enforcement, memory logging, and S3 log upload |
 | `retrieveArticles.py` | Fetches person and article data from S3 and DynamoDB in batches |
 | `retrieveNIH.py` | Fetches NIH iCite metrics in batches of 150; loads to staging table with validation and atomic swap |
+| `retrieveReporter.py` | Fetches NIH RePORTER grants/linkages and reconciles them into `grant_reporter_*` / `grant_provenance` |
+| `retrieveArticleProvenance.py` | Scans DynamoDB `ArticleProvenance` → `article_provenance` (pmid, personIdentifier) via staging + atomic swap; runs non-fatal |
 | `retrieveAltmetric.py` | Fetches Altmetric scores for articles published in the last 2 years |
 | `updateReciterDB.py` | Bulk data loader using `LOAD DATA LOCAL INFILE` with retry and reconnect logic |
 | `dataTransformer.py` | Transforms ReCiter JSON output to CSV format for all `person_*` tables |
diff --git a/setup/alter_add_article_provenance_v1.6.sql b/setup/alter_add_article_provenance_v1.6.sql
new file mode 100644
index 0000000..206ca9f
--- /dev/null
+++ b/setup/alter_add_article_provenance_v1.6.sql
@@ -0,0 +1,56 @@
+-- =============================================================================
+-- Migration: Add article_provenance table (v1.6)
+-- =============================================================================
+-- Creates the article_provenance table on EXISTING databases (dev, prod). Fresh
+-- builds already get it from setup/createDatabaseTableReciterDb.sql (ReCiterDB#95).
+--
+-- WHAT IT IS:
+--   First-retrieval provenance per (article, person), loaded nightly by
+--   update/retrieveArticleProvenance.py from the ReCiter DynamoDB
+--   `ArticleProvenance` table. That source table has a COMPOSITE key
+--   uid (HASH, personIdentifier) + articleId (RANGE, PMID), so there is one item
+--   per person+article. This table mirrors that key exactly -- PRIMARY KEY
+--   (pmid, personIdentifier) -- rather than collapsing to one row per PMID.
+--
+--   Columns map from DynamoDB attributes:
+--     pmid               <- articleId (String PMID -> INT)
+--     personIdentifier   <- uid
+--     firstRetrievalDate <- frd  (epoch SECONDS, UTC -> DATETIME)
+--     retrievalStrategy  <- rs   (PM_UI_SEARCH, PM_AUTHOR, ...)
+--     source             <- src  (PM, CTSC, GS, MAN, MAN_FROM_PM, ...)
+--
+-- WHY THIS MIGRATION EXISTS:
+--   Publication Manager #737 displays "date a publication was first retrieved"
+--   in /curate and reads this table by PMID. The table must exist before the PM
+--   #737 branch is deployed against this database.
+--
+-- DURABILITY / ETL CONTRACT:
+--   Loaded via a staging table (article_provenance_new) + atomic RENAME swap by
+--   the nightly ETL, exactly like analysis_nih. It is NOT in
+--   update/updateReciterDB.py's truncate list and is not touched by any nightly
+--   stored procedure. A failure in the ETL step leaves production untouched and
+--   does not block the rest of run_all.py (the step runs as non-fatal).
+--
+-- Safe to run on prod and dev. CREATE TABLE IF NOT EXISTS is a no-op on re-run
+-- and additive only -- no existing table or row is modified.
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS `article_provenance` (
+  `pmid`               int(11)      NOT NULL,
+  `personIdentifier`   varchar(128) NOT NULL,
+  `firstRetrievalDate` datetime     DEFAULT NULL,
+  `retrievalStrategy`  varchar(64)  DEFAULT NULL,
+  `source`             varchar(32)  DEFAULT NULL,
+  PRIMARY KEY (`pmid`, `personIdentifier`),
+  KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+-- -----------------------------------------------------------------------------
+-- Verification
+-- -----------------------------------------------------------------------------
+
+SELECT table_name, column_name, data_type, character_maximum_length, is_nullable, column_key
+FROM information_schema.columns
+WHERE table_schema = DATABASE()
+  AND table_name = 'article_provenance'
+ORDER BY ordinal_position;
diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql
index 91c8641..d5127fe 100644
--- a/setup/createDatabaseTableReciterDb.sql
+++ b/setup/createDatabaseTableReciterDb.sql
@@ -492,6 +492,23 @@ CREATE TABLE IF NOT EXISTS `analysis_temp_output_table_cell` (
   KEY `personIdentifier` (`personIdentifier`) USING BTREE
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
 
+-- article_provenance: first-retrieval provenance per (article, person), loaded
+-- nightly by update/retrieveArticleProvenance.py from the ReCiter DynamoDB
+-- `ArticleProvenance` table (composite key uid + articleId). Keyed on
+-- (pmid, personIdentifier) to mirror that source key exactly -- one row per
+-- person+article. firstRetrievalDate is `frd` (epoch seconds, UTC) converted to
+-- DATETIME. Loaded via staging->atomic-swap; NOT in any truncate list.
+-- Consumed by Publication Manager #737 ("date a publication was first retrieved").
+CREATE TABLE IF NOT EXISTS `article_provenance` (
+  `pmid`               int(11)      NOT NULL,
+  `personIdentifier`   varchar(128) NOT NULL,
+  `firstRetrievalDate` datetime     DEFAULT NULL,
+  `retrievalStrategy`  varchar(64)  DEFAULT NULL,
+  `source`             varchar(32)  DEFAULT NULL,
+  PRIMARY KEY (`pmid`, `personIdentifier`),
+  KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
 -- ============================================================================
 -- Journal Tables
 -- ============================================================================
diff --git a/update/retrieveArticleProvenance.py b/update/retrieveArticleProvenance.py
new file mode 100644
index 0000000..ec06254
--- /dev/null
+++ b/update/retrieveArticleProvenance.py
@@ -0,0 +1,353 @@
+# retrieveArticleProvenance.py
+#
+# Nightly ETL step that scans the ReCiter DynamoDB `ArticleProvenance` table and
+# loads it into the reciterdb `article_provenance` table. Powers the Publication
+# Manager "date a publication was first retrieved" display in /curate
+# (wcmc-its/ReCiter-Publication-Manager#737); backend half of ReCiterDB#95.
+#
+# Source table (reciter.service.dynamo.ArticleProvenanceServiceImpl):
+#   - COMPOSITE key: `uid` (HASH, the personIdentifier/CWID) + `articleId` (RANGE,
+#     the PMID as a String). One item per (person, article) pair.
+#   - `frd` = first retrieval date, epoch SECONDS, written with if_not_exists so it
+#     is immutable once set (the first time that person retrieved that article).
+#   - `rs`  = first retrieval strategy (PM_UI_SEARCH, PM_AUTHOR, ...).
+#   - `src` = source (PM, CTSC, GS, MAN, MAN_FROM_PM, ...).
+#   - `ads` = String Set of all strategies seen (not loaded here).
+#
+# reciterdb target is keyed on (pmid, personIdentifier) -- it mirrors the DynamoDB
+# composite key exactly (one row per person+article), so no cross-person collapse
+# is performed. frd (epoch seconds, UTC) is converted to a DATETIME on load to
+# match the rest of reciterdb; PM formats it for display.
+#
+# Memory: rows are streamed into the staging table one scan page at a time
+# (INSERT IGNORE), so peak RSS is bounded to one page regardless of corpus size.
+# INSERT IGNORE also collapses any (pmid, personIdentifier) collision (e.g. a
+# case-variant uid under the utf8mb4_unicode_ci PK) rather than aborting the load.
+#
+# Atomicity: mirrors retrieveNIH.py -- load into a `article_provenance_new`
+# staging table, validate it against production, then RENAME-swap. A failure here
+# leaves production untouched. run_all.py runs this step as NON-FATAL so a hiccup
+# does not block the nightly indexing SP (PM reads this table directly; nothing
+# downstream depends on it).
+
+import os
+import sys
+import time
+import random
+import logging
+import faulthandler
+import signal
+from datetime import datetime, timezone
+
+import boto3
+from botocore.config import Config
+from botocore.exceptions import BotoCoreError, ClientError
+import pymysql.cursors
+import pymysql.err
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler('retrieveArticleProvenance.log', mode='w'),
+        logging.StreamHandler(sys.stdout),
+    ],
+)
+logger = logging.getLogger(__name__)
+
+faulthandler.enable(file=sys.stderr, all_threads=True)
+faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True)
+
+DYNAMO_TABLE = 'ArticleProvenance'
+TARGET_TABLE = 'article_provenance'
+STAGING_TABLE = 'article_provenance_new'
+BACKUP_TABLE = 'article_provenance_backup'
+SCAN_PAGE_SIZE = 1000
+
+# Validation floor: reject a partial/empty scan that would otherwise seed or
+# replace production with too little data (matches retrieveNIH's min_rows).
+MIN_STAGING_ROWS = 100
+# Warn if more than this fraction of scanned items are skipped (data-quality signal).
+SKIP_RATIO_WARN = 0.10
+
+# Lower plausibility bound for frd (epoch seconds) = 2000-01-01 UTC; the upper bound
+# (now + 1 day) is computed at call time. Values outside are stored as NULL.
+MIN_EPOCH_SECONDS = 946684800
+
+# DDL kept in sync with setup/createDatabaseTableReciterDb.sql and
+# setup/alter_add_article_provenance_v1.6.sql. Created defensively so a fresh
+# environment that has not yet run the migration still works.
+CREATE_TARGET_SQL = f"""
+CREATE TABLE IF NOT EXISTS `{TARGET_TABLE}` (
+  `pmid`               int(11)      NOT NULL,
+  `personIdentifier`   varchar(128) NOT NULL,
+  `firstRetrievalDate` datetime     DEFAULT NULL,
+  `retrievalStrategy`  varchar(64)  DEFAULT NULL,
+  `source`             varchar(32)  DEFAULT NULL,
+  PRIMARY KEY (`pmid`, `personIdentifier`),
+  KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
+"""
+
+LOAD_COLUMNS = ['pmid', 'personIdentifier', 'firstRetrievalDate',
+                'retrievalStrategy', 'source']
+
+
+def connect_db(max_retries=5, backoff_factor=1):
+    """Connect using DB_USERNAME/DB_PASSWORD/DB_HOST/DB_NAME with jittered exponential
+    backoff between attempts; raises RuntimeError when every attempt fails."""
+    username = os.environ['DB_USERNAME']
+    password = os.environ['DB_PASSWORD']
+    hostname = os.environ['DB_HOST']
+    database = os.environ['DB_NAME']
+    last_err = None
+    for retry in range(max_retries):
+        try:
+            conn = pymysql.connect(
+                user=username, password=password, database=database, host=hostname,
+                charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)
+            logger.info('Connected to database %s on %s', database, hostname)
+            return conn
+        except pymysql.err.MySQLError as err:
+            last_err = err
+            logger.error('DB connect attempt %d failed: %s', retry + 1, err)
+            if retry < max_retries - 1:  # no pointless sleep after the final attempt
+                time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 1))
+    raise RuntimeError('Could not connect to database after retries') from last_err
+
+
+def epoch_to_datetime_str(frd):
+    """Convert an epoch-seconds value (DynamoDB Decimal/int/str) to a UTC
+    'YYYY-MM-DD HH:MM:SS' string, or None if absent/invalid/out-of-range. frd is
+    stored as UTC; Publication Manager formats it for display."""
+    if frd is None:
+        return None
+    try:
+        secs = int(frd)
+    except (TypeError, ValueError, OverflowError):  # OverflowError: infinite Decimal/float
+        logger.warning('Unparseable frd value: %r; storing NULL', frd)
+        return None
+    # Reject implausible timestamps (corrupt data) rather than store a bogus year.
+    upper = int(datetime.now(tz=timezone.utc).timestamp()) + 86400  # now + 1 day skew
+    if secs < MIN_EPOCH_SECONDS or secs > upper:
+        logger.warning('Out-of-range frd value: %r; storing NULL', frd)
+        return None
+    try:
+        return datetime.fromtimestamp(secs, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
+    except (OverflowError, OSError, ValueError):
+        logger.warning('Unconvertible frd value: %r; storing NULL', frd)
+        return None
+
+
+def scan_article_provenance(dynamo_table):
+    """Generator over a full scan of the ArticleProvenance table: yields each
+    non-empty page (a list of items) and follows LastEvaluatedKey until the
+    scan is exhausted. Eventually-consistent read is fine for a nightly
+    snapshot (frd is immutable once written)."""
+    seen = 0
+    scan_args = {'Limit': SCAN_PAGE_SIZE}
+    while True:
+        page = dynamo_table.scan(**scan_args)
+        batch = page.get('Items', [])
+        seen += len(batch)
+        if batch:
+            logger.info('Scanned %d items from %s (running total: %d).',
+                        len(batch), DYNAMO_TABLE, seen)
+            yield batch
+        resume_key = page.get('LastEvaluatedKey')
+        if not resume_key:
+            break
+        # Resume the next request from where this page stopped.
+        scan_args['ExclusiveStartKey'] = resume_key
+    logger.info('Finished scanning %s. Total items: %d.', DYNAMO_TABLE, seen)
+
+
+def item_to_row(item):
+    """Map one ArticleProvenance item to a row tuple matching LOAD_COLUMNS, or
+    return None to skip (missing uid / non-numeric articleId)."""
+    uid = item.get('uid')
+    if not uid:
+        return None
+    try:
+        pmid = int(item.get('articleId'))
+    except (TypeError, ValueError):
+        return False  # distinguishes bad-pmid from no-uid for counting
+    first_retrieval = epoch_to_datetime_str(item.get('frd'))
+    rs = item.get('rs')
+    src = item.get('src')
+    rs = str(rs)[:64] if rs is not None else None
+    src = str(src)[:32] if src is not None else None
+    return (pmid, str(uid)[:128], first_retrieval, rs, src)
+
+
+def stream_into_staging(conn, cursor, dynamo_table):
+    """Scan ArticleProvenance and INSERT IGNORE each page into the staging table.
+    Streaming keeps peak memory to one page; INSERT IGNORE collapses any
+    (pmid, personIdentifier) duplicate (incl. case-variant uids under the
+    case-insensitive PK) instead of aborting. Returns scan/skip stats."""
+    col_list = ', '.join(f'`{c}`' for c in LOAD_COLUMNS)
+    placeholders = ', '.join(['%s'] * len(LOAD_COLUMNS))
+    sql = f"INSERT IGNORE INTO `{STAGING_TABLE}` ({col_list}) VALUES ({placeholders})"
+
+    scanned = skipped_no_uid = skipped_bad_pmid = 0
+    for page in scan_article_provenance(dynamo_table):
+        scanned += len(page)
+        page_rows = []
+        for item in page:
+            row = item_to_row(item)
+            if row is None:
+                skipped_no_uid += 1
+            elif row is False:
+                skipped_bad_pmid += 1
+            else:
+                page_rows.append(row)
+        if page_rows:
+            cursor.executemany(sql, page_rows)
+            conn.commit()
+
+    cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`")
+    staged = cursor.fetchone()['c']
+    skipped = skipped_no_uid + skipped_bad_pmid
+    logger.info('Scanned %d items; staged %d rows (skipped %d no-uid, %d bad-pmid).',
+                scanned, staged, skipped_no_uid, skipped_bad_pmid)
+    if scanned and (skipped / scanned) > SKIP_RATIO_WARN:
+        logger.warning('High skip ratio: %d/%d (%.1f%%) of scanned items were '
+                       'dropped; staged table may be partial.',
+                       skipped, scanned, 100.0 * skipped / scanned)
+    return {'scanned': scanned, 'staged': staged}
+
+
+def create_staging_table(cursor):
+    """(Re)create the staging table as an empty clone of production."""
+    cursor.execute(CREATE_TARGET_SQL)  # ensure production exists (fresh env)
+    cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`")
+    cursor.execute(f"CREATE TABLE `{STAGING_TABLE}` LIKE `{TARGET_TABLE}`")
+    logger.info('Created staging table %s', STAGING_TABLE)
+
+
+def recover_orphaned_backup(conn, cursor):
+    """Self-heal from a prior run that died after RENAMEing production away but
+    before the swap completed: if production is gone but a backup exists, restore
+    it so we never fabricate an empty production table over a good backup."""
+    cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'")
+    if cursor.fetchone():
+        return
+    cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'")
+    if cursor.fetchone():
+        logger.warning('Production %s missing but %s present (orphaned prior run); '
+                       'restoring backup before proceeding.', TARGET_TABLE, BACKUP_TABLE)
+        cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`")
+        conn.commit()
+
+
+def validate_staging(cursor, min_rows=MIN_STAGING_ROWS, min_percentage=80):
+    """Guard against replacing a healthy production table with a partial/empty
+    scan. Requires the staging table to meet a row floor and (when production
+    already has data) to be at least min_percentage of the production row count."""
+    cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`")
+    staging_count = cursor.fetchone()['c']
+    cursor.execute(f"SELECT COUNT(*) AS c FROM `{TARGET_TABLE}`")
+    production_count = cursor.fetchone()['c']
+    logger.info('Validation: %s has %d rows, %s has %d rows',
+                STAGING_TABLE, staging_count, TARGET_TABLE, production_count)
+
+    if staging_count < min_rows:
+        logger.error('Validation FAILED: %s has %d rows (minimum %d)',
+                     STAGING_TABLE, staging_count, min_rows)
+        return False
+    if production_count > 0:
+        percentage = (staging_count / production_count) * 100
+        logger.info('Staging is %.1f%% of production', percentage)
+        if percentage < min_percentage:
+            logger.error('Validation FAILED: staging (%d) is only %.1f%% of '
+                         'production (%d); minimum %d%%',
+                         staging_count, percentage, production_count, min_percentage)
+            return False
+    logger.info('Validation PASSED for %s', STAGING_TABLE)
+    return True
+
+
+def atomic_swap(conn, cursor):
+    """Atomically swap staging into production: production -> backup, staging ->
+    production, in a single RENAME TABLE (atomic in MariaDB/InnoDB)."""
+    cursor.execute(f"DROP TABLE IF EXISTS `{BACKUP_TABLE}`")
+    rename_sql = (f"RENAME TABLE `{TARGET_TABLE}` TO `{BACKUP_TABLE}`, "
+                  f"`{STAGING_TABLE}` TO `{TARGET_TABLE}`")
+    logger.info('Executing atomic swap: %s', rename_sql)
+    cursor.execute(rename_sql)
+    conn.commit()
+    logger.info('Atomic swap completed for %s', TARGET_TABLE)
+
+
+def restore_from_backup(conn, cursor):
+    """Rename the backup table back to production if a swap failed mid-flight.
+    Because the swap is a single atomic RENAME, a raised swap means NEITHER table
+    was renamed and the backup was already dropped -- so 'no backup found' here is
+    the expected, SAFE outcome (production was never moved), not a data-loss event."""
+    cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'")
+    if not cursor.fetchone():
+        logger.info('No backup table %s to restore (production left untouched).',
+                    BACKUP_TABLE)
+        return
+    cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'")
+    if cursor.fetchone():
+        cursor.execute(f"DROP TABLE IF EXISTS `{TARGET_TABLE}`")
+    cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`")
+    conn.commit()
+    logger.info('Restored %s from backup', TARGET_TABLE)
+
+
+def cleanup_staging(conn, cursor):
+    cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`")
+    conn.commit()
+    logger.info('Cleaned up staging table %s', STAGING_TABLE)
+
+
+def main():
+    conn = connect_db()
+    cursor = conn.cursor()
+
+    cfg = Config(retries={'max_attempts': 10, 'mode': 'standard'})
+    dynamo_table = boto3.resource('dynamodb', config=cfg).Table(DYNAMO_TABLE)
+
+    try:
+        recover_orphaned_backup(conn, cursor)
+        create_staging_table(cursor)
+        conn.commit()
+
+        stream_into_staging(conn, cursor, dynamo_table)
+
+        if not validate_staging(cursor):
+            logger.error('Validation failed; aborting swap to protect production.')
+            cleanup_staging(conn, cursor)
+            sys.exit(1)
+
+        try:
+            atomic_swap(conn, cursor)
+        except Exception as swap_err:
+            logger.error('Atomic swap failed: %s; attempting restore.', swap_err)
+            restore_from_backup(conn, cursor)
+            cleanup_staging(conn, cursor)
+            sys.exit(1)
+
+        logger.info('SUCCESS: %s updated with zero downtime.', TARGET_TABLE)
+
+    except (BotoCoreError, ClientError) as e:
+        logger.error('DynamoDB error during %s scan: %s', DYNAMO_TABLE, e)
+        cleanup_staging(conn, cursor)
+        sys.exit(1)
+    except pymysql.err.MySQLError as e:
+        logger.error('Database error: %s', e)
+        try:
+            cleanup_staging(conn, cursor)
+        except Exception:
+            pass
+        sys.exit(1)
+    finally:
+        cursor.close()
+        conn.close()
+        logger.info('Database connection closed.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/update/run_all.py b/update/run_all.py
index 6ebe245..61713a1 100644
--- a/update/run_all.py
+++ b/update/run_all.py
@@ -109,22 +109,30 @@ def upload_log_to_s3():
 
 # ------------- Main Flow -------------
 def main():
+    # (name, command, non_fatal). non_fatal=True steps log a warning and continue
+    # on failure instead of aborting the pipeline.
     scripts = [
-        ("executeFeatureGenerator", "python3 executeFeatureGenerator.py"),
-        ("retrieveArticles", "python3 retrieveArticles.py"),
-        ("retrieveNIH", "python3 retrieveNIH.py"),
-        ("retrieveReporter", "python3 retrieveReporter.py"),
-        ("nightlyIndexing", "bash run_nightly_indexing.sh"),
-        ("abstractImport", "python3 abstractImport.py"),
-        ("conflictsImport", "python3 conflictsImport.py")
+        ("executeFeatureGenerator", "python3 executeFeatureGenerator.py", False),
+        ("retrieveArticles", "python3 retrieveArticles.py", False),
+        ("retrieveNIH", "python3 retrieveNIH.py", False),
+        ("retrieveReporter", "python3 retrieveReporter.py", False),
+        # article_provenance feeds a PM display field only; nothing downstream
+        # depends on it, so a failure here must not block nightly indexing.
+        ("retrieveArticleProvenance", "python3 retrieveArticleProvenance.py", True),
+        ("nightlyIndexing", "bash run_nightly_indexing.sh", False),
+        ("abstractImport", "python3 abstractImport.py", False),
+        ("conflictsImport", "python3 conflictsImport.py", False)
     ]
 
     overall_success = True
 
-    for name, cmd in scripts:
+    for name, cmd, non_fatal in scripts:
         #ok = run_script(name, cmd)
         ok = run_script(name, cmd, timeout_seconds=int(os.getenv("SCRIPT_TIMEOUT_SECONDS", "15000")))
         if not ok:
+            if non_fatal:
+                logger.warning(f"⚠️ NON-FATAL: {name} failed; continuing pipeline.")
+                continue
             overall_success = False
             logger.error("Stopping pipeline due to script failure.")
             break

From fca972a755c63e494bf24b7822bb2864fc8d5d27 Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Sun, 14 Jun 2026 13:26:42 -0400
Subject: [PATCH 18/19] Revert "Merge pull request #96 from
 wcmc-its/feature/article-provenance-etl"

This reverts commit acd26bb561adfa6ec69de95c312aea24f43d6d76, reversing
changes made to ffd06311d22961791b40aa0e1c094566c1552e3a.
---
 Dockerfile                                  |   1 -
 README.md                                   |  13 +-
 setup/alter_add_article_provenance_v1.6.sql |  56 ----
 setup/createDatabaseTableReciterDb.sql      |  17 -
 update/retrieveArticleProvenance.py         | 353 --------------------
 update/run_all.py                           |  24 +-
 6 files changed, 11 insertions(+), 453 deletions(-)
 delete mode 100644 setup/alter_add_article_provenance_v1.6.sql
 delete mode 100644 update/retrieveArticleProvenance.py

diff --git a/Dockerfile b/Dockerfile
index a388a25..f4f344b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -14,7 +14,6 @@ ENV PYTHONUNBUFFERED=1
 # Copy additional Python scripts
 COPY update/retrieveNIH.py ./
 COPY update/retrieveReporter.py ./
-COPY update/retrieveArticleProvenance.py ./
 COPY update/retrieveAltmetric.py ./
 COPY update/retrieveArticles.py ./
 COPY update/updateReciterDB.py ./
diff --git a/README.md b/README.md
index dc22384..c39a9e1 100644
--- a/README.md
+++ b/README.md
@@ -69,15 +69,12 @@ CronJob (daily)
      ├─ 1. executeFeatureGenerator.py    Trigger ReCiter ML scoring API
      ├─ 2. retrieveArticles.py           Pull person/article data from S3 + DynamoDB
      ├─ 3. retrieveNIH.py                NIH iCite API → analysis_nih (atomic swap)
-     ├─ 4. retrieveReporter.py           NIH RePORTER grants → grant_reporter_* (reconcile)
-     ├─ 5. retrieveArticleProvenance.py  ArticleProvenance (DynamoDB) → article_provenance
-     │      └─ non-fatal: a failure here does not block nightly indexing
-     ├─ 6. run_nightly_indexing.sh        Run populateAnalysisSummaryTables_v2()
+     ├─ 4. run_nightly_indexing.sh        Run populateAnalysisSummaryTables_v2()
      │      ├─ Polls analysis_job_log every 3s for progress
      │      ├─ Auto-retries 3x with 60s backoff
      │      └─ Auto-restores from backup on failure
-     ├─ 7. abstractImport.py             PubMed abstracts from DynamoDB
-     └─ 8. conflictsImport.py            COI statements from DynamoDB
+     ├─ 5. abstractImport.py             PubMed abstracts from DynamoDB
+     └─ 6. conflictsImport.py            COI statements from DynamoDB
 ```
 
 **Key patterns:**
@@ -108,8 +105,6 @@ ReCiterDB/
 │   ├── run_nightly_indexing.sh                    # SP runner with monitoring/retry
 │   ├── retrieveArticles.py                        # S3 + DynamoDB article fetcher
 │   ├── retrieveNIH.py                             # NIH iCite fetcher (atomic swap)
-│   ├── retrieveReporter.py                        # NIH RePORTER grants fetcher (reconcile)
-│   ├── retrieveArticleProvenance.py               # ArticleProvenance (DynamoDB) → article_provenance
 │   ├── retrieveAltmetric.py                       # Altmetric API fetcher
 │   ├── updateReciterDB.py                         # Bulk loader (LOAD DATA LOCAL INFILE)
 │   ├── dataTransformer.py                         # ReCiter JSON → CSV
@@ -293,8 +288,6 @@ All defined in `setup/createEventsProceduresReciterDb.sql`.
 | `run_all.py` | EKS orchestrator: runs all pipeline steps in sequence with timeout enforcement, memory logging, and S3 log upload |
 | `retrieveArticles.py` | Fetches person and article data from S3 and DynamoDB in batches |
 | `retrieveNIH.py` | Fetches NIH iCite metrics in batches of 150; loads to staging table with validation and atomic swap |
-| `retrieveReporter.py` | Fetches NIH RePORTER grants/linkages and reconciles them into `grant_reporter_*` / `grant_provenance` |
-| `retrieveArticleProvenance.py` | Scans DynamoDB `ArticleProvenance` → `article_provenance` (pmid, personIdentifier) via staging + atomic swap; runs non-fatal |
 | `retrieveAltmetric.py` | Fetches Altmetric scores for articles published in the last 2 years |
 | `updateReciterDB.py` | Bulk data loader using `LOAD DATA LOCAL INFILE` with retry and reconnect logic |
 | `dataTransformer.py` | Transforms ReCiter JSON output to CSV format for all `person_*` tables |
diff --git a/setup/alter_add_article_provenance_v1.6.sql b/setup/alter_add_article_provenance_v1.6.sql
deleted file mode 100644
index 206ca9f..0000000
--- a/setup/alter_add_article_provenance_v1.6.sql
+++ /dev/null
@@ -1,56 +0,0 @@
--- =============================================================================
--- Migration: Add article_provenance table (v1.6)
--- =============================================================================
--- Creates the article_provenance table on EXISTING databases (dev, prod). Fresh
--- builds already get it from setup/createDatabaseTableReciterDb.sql (ReCiterDB#95).
---
--- WHAT IT IS:
---   First-retrieval provenance per (article, person), loaded nightly by
---   update/retrieveArticleProvenance.py from the ReCiter DynamoDB
---   `ArticleProvenance` table. That source table has a COMPOSITE key
---   uid (HASH, personIdentifier) + articleId (RANGE, PMID), so there is one item
---   per person+article. This table mirrors that key exactly -- PRIMARY KEY
---   (pmid, personIdentifier) -- rather than collapsing to one row per PMID.
---
---   Columns map from DynamoDB attributes:
---     pmid               <- articleId (String PMID -> INT)
---     personIdentifier   <- uid
---     firstRetrievalDate <- frd  (epoch SECONDS, UTC -> DATETIME)
---     retrievalStrategy  <- rs   (PM_UI_SEARCH, PM_AUTHOR, ...)
---     source             <- src  (PM, CTSC, GS, MAN, MAN_FROM_PM, ...)
---
--- WHY THIS MIGRATION EXISTS:
---   Publication Manager #737 displays "date a publication was first retrieved"
---   in /curate and reads this table by PMID. The table must exist before the PM
---   #737 branch is deployed against this database.
---
--- DURABILITY / ETL CONTRACT:
---   Loaded via a staging table (article_provenance_new) + atomic RENAME swap by
---   the nightly ETL, exactly like analysis_nih. It is NOT in
---   update/updateReciterDB.py's truncate list and is not touched by any nightly
---   stored procedure. A failure in the ETL step leaves production untouched and
---   does not block the rest of run_all.py (the step runs as non-fatal).
---
--- Safe to run on prod and dev. CREATE TABLE IF NOT EXISTS is a no-op on re-run
--- and additive only -- no existing table or row is modified.
--- =============================================================================
-
-CREATE TABLE IF NOT EXISTS `article_provenance` (
-  `pmid`               int(11)      NOT NULL,
-  `personIdentifier`   varchar(128) NOT NULL,
-  `firstRetrievalDate` datetime     DEFAULT NULL,
-  `retrievalStrategy`  varchar(64)  DEFAULT NULL,
-  `source`             varchar(32)  DEFAULT NULL,
-  PRIMARY KEY (`pmid`, `personIdentifier`),
-  KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-
--- -----------------------------------------------------------------------------
--- Verification
--- -----------------------------------------------------------------------------
-
-SELECT table_name, column_name, data_type, character_maximum_length, is_nullable, column_key
-FROM information_schema.columns
-WHERE table_schema = DATABASE()
-  AND table_name = 'article_provenance'
-ORDER BY ordinal_position;
diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql
index d5127fe..91c8641 100644
--- a/setup/createDatabaseTableReciterDb.sql
+++ b/setup/createDatabaseTableReciterDb.sql
@@ -492,23 +492,6 @@ CREATE TABLE IF NOT EXISTS `analysis_temp_output_table_cell` (
   KEY `personIdentifier` (`personIdentifier`) USING BTREE
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
 
--- article_provenance: first-retrieval provenance per (article, person), loaded
--- nightly by update/retrieveArticleProvenance.py from the ReCiter DynamoDB
--- `ArticleProvenance` table (composite key uid + articleId). Keyed on
--- (pmid, personIdentifier) to mirror that source key exactly -- one row per
--- person+article. firstRetrievalDate is `frd` (epoch seconds, UTC) converted to
--- DATETIME. Loaded via staging->atomic-swap; NOT in any truncate list.
--- Consumed by Publication Manager #737 ("date a publication was first retrieved").
-CREATE TABLE IF NOT EXISTS `article_provenance` (
-  `pmid`               int(11)      NOT NULL,
-  `personIdentifier`   varchar(128) NOT NULL,
-  `firstRetrievalDate` datetime     DEFAULT NULL,
-  `retrievalStrategy`  varchar(64)  DEFAULT NULL,
-  `source`             varchar(32)  DEFAULT NULL,
-  PRIMARY KEY (`pmid`, `personIdentifier`),
-  KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-
 -- ============================================================================
 -- Journal Tables
 -- ============================================================================
diff --git a/update/retrieveArticleProvenance.py b/update/retrieveArticleProvenance.py
deleted file mode 100644
index ec06254..0000000
--- a/update/retrieveArticleProvenance.py
+++ /dev/null
@@ -1,353 +0,0 @@
-# retrieveArticleProvenance.py
-#
-# Nightly ETL step that scans the ReCiter DynamoDB `ArticleProvenance` table and
-# loads it into the reciterdb `article_provenance` table. Powers the Publication
-# Manager "date a publication was first retrieved" display in /curate
-# (wcmc-its/ReCiter-Publication-Manager#737); backend half of ReCiterDB#95.
-#
-# Source table (reciter.service.dynamo.ArticleProvenanceServiceImpl):
-#   - COMPOSITE key: `uid` (HASH, the personIdentifier/CWID) + `articleId` (RANGE,
-#     the PMID as a String). One item per (person, article) pair.
-#   - `frd` = first retrieval date, epoch SECONDS, written with if_not_exists so it
-#     is immutable once set (the first time that person retrieved that article).
-#   - `rs`  = first retrieval strategy (PM_UI_SEARCH, PM_AUTHOR, ...).
-#   - `src` = source (PM, CTSC, GS, MAN, MAN_FROM_PM, ...).
-#   - `ads` = String Set of all strategies seen (not loaded here).
-#
-# reciterdb target is keyed on (pmid, personIdentifier) -- it mirrors the DynamoDB
-# composite key exactly (one row per person+article), so no cross-person collapse
-# is performed. frd (epoch seconds, UTC) is converted to a DATETIME on load to
-# match the rest of reciterdb; PM formats it for display.
-#
-# Memory: rows are streamed into the staging table one scan page at a time
-# (INSERT IGNORE), so peak RSS is bounded to one page regardless of corpus size.
-# INSERT IGNORE also collapses any (pmid, personIdentifier) collision (e.g. a
-# case-variant uid under the utf8mb4_unicode_ci PK) rather than aborting the load.
-#
-# Atomicity: mirrors retrieveNIH.py -- load into a `article_provenance_new`
-# staging table, validate it against production, then RENAME-swap. A failure here
-# leaves production untouched. run_all.py runs this step as NON-FATAL so a hiccup
-# does not block the nightly indexing SP (PM reads this table directly; nothing
-# downstream depends on it).
-
-import os
-import sys
-import time
-import random
-import logging
-import faulthandler
-import signal
-from datetime import datetime, timezone
-
-import boto3
-from botocore.config import Config
-from botocore.exceptions import BotoCoreError, ClientError
-import pymysql.cursors
-import pymysql.err
-
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler('retrieveArticleProvenance.log', mode='w'),
-        logging.StreamHandler(sys.stdout),
-    ],
-)
-logger = logging.getLogger(__name__)
-
-faulthandler.enable(file=sys.stderr, all_threads=True)
-faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True)
-
-DYNAMO_TABLE = 'ArticleProvenance'
-TARGET_TABLE = 'article_provenance'
-STAGING_TABLE = 'article_provenance_new'
-BACKUP_TABLE = 'article_provenance_backup'
-SCAN_PAGE_SIZE = 1000
-
-# Validation floor: reject a partial/empty scan that would otherwise seed or
-# replace production with too little data (matches retrieveNIH's min_rows).
-MIN_STAGING_ROWS = 100
-# Warn if more than this fraction of scanned items are skipped (data-quality signal).
-SKIP_RATIO_WARN = 0.10
-
-# Plausible bounds for frd (epoch seconds). Anything outside is treated as corrupt
-# and stored NULL rather than a bogus DATETIME. Lower bound = 2000-01-01 UTC.
-MIN_EPOCH_SECONDS = 946684800
-
-# DDL kept in sync with setup/createDatabaseTableReciterDb.sql and
-# setup/alter_add_article_provenance_v1.6.sql. Created defensively so a fresh
-# environment that has not yet run the migration still works.
-CREATE_TARGET_SQL = f"""
-CREATE TABLE IF NOT EXISTS `{TARGET_TABLE}` (
-  `pmid`               int(11)      NOT NULL,
-  `personIdentifier`   varchar(128) NOT NULL,
-  `firstRetrievalDate` datetime     DEFAULT NULL,
-  `retrievalStrategy`  varchar(64)  DEFAULT NULL,
-  `source`             varchar(32)  DEFAULT NULL,
-  PRIMARY KEY (`pmid`, `personIdentifier`),
-  KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE
-) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
-"""
-
-LOAD_COLUMNS = ['pmid', 'personIdentifier', 'firstRetrievalDate',
-                'retrievalStrategy', 'source']
-
-
-def connect_db(max_retries=5, backoff_factor=1):
-    username = os.environ['DB_USERNAME']
-    password = os.environ['DB_PASSWORD']
-    hostname = os.environ['DB_HOST']
-    database = os.environ['DB_NAME']
-    for retry in range(max_retries):
-        try:
-            conn = pymysql.connect(
-                user=username,
-                password=password,
-                database=database,
-                host=hostname,
-                charset='utf8mb4',
-                cursorclass=pymysql.cursors.DictCursor,
-            )
-            logger.info('Connected to database %s on %s', database, hostname)
-            return conn
-        except pymysql.err.MySQLError as err:
-            logger.error('DB connect attempt %d failed: %s', retry + 1, err)
-            time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 1))
-    raise RuntimeError('Could not connect to database after retries')
-
-
-def epoch_to_datetime_str(frd):
-    """Convert an epoch-seconds value (DynamoDB Decimal/int/str) to a UTC
-    'YYYY-MM-DD HH:MM:SS' string, or None if absent/invalid/out-of-range. frd is
-    stored as UTC; Publication Manager formats it for display."""
-    if frd is None:
-        return None
-    try:
-        secs = int(frd)
-    except (TypeError, ValueError):
-        logger.warning('Unparseable frd value: %r; storing NULL', frd)
-        return None
-    # Reject implausible timestamps (corrupt data) rather than store a bogus year.
-    upper = int(datetime.now(tz=timezone.utc).timestamp()) + 86400  # now + 1 day skew
-    if secs < MIN_EPOCH_SECONDS or secs > upper:
-        logger.warning('Out-of-range frd value: %r; storing NULL', frd)
-        return None
-    try:
-        return datetime.fromtimestamp(secs, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
-    except (OverflowError, OSError, ValueError):
-        logger.warning('Unconvertible frd value: %r; storing NULL', frd)
-        return None
-
-
-def scan_article_provenance(dynamo_table):
-    """Generator that yields pages (lists of items) from a full scan of the
-    ArticleProvenance table. Eventually-consistent read is fine for a nightly
-    snapshot (frd is immutable once written)."""
-    total = 0
-    last_key = None
-    while True:
-        kwargs = {'Limit': SCAN_PAGE_SIZE}
-        if last_key:
-            kwargs['ExclusiveStartKey'] = last_key
-        response = dynamo_table.scan(**kwargs)
-        items = response.get('Items', [])
-        total += len(items)
-        if items:
-            logger.info('Scanned %d items from %s (running total: %d).',
-                        len(items), DYNAMO_TABLE, total)
-            yield items
-        last_key = response.get('LastEvaluatedKey')
-        if not last_key:
-            break
-    logger.info('Finished scanning %s. Total items: %d.', DYNAMO_TABLE, total)
-
-
-def item_to_row(item):
-    """Map one ArticleProvenance item to a row tuple matching LOAD_COLUMNS, or
-    return None to skip (missing uid / non-numeric articleId)."""
-    uid = item.get('uid')
-    if not uid:
-        return None
-    try:
-        pmid = int(item.get('articleId'))
-    except (TypeError, ValueError):
-        return False  # distinguishes bad-pmid from no-uid for counting
-    first_retrieval = epoch_to_datetime_str(item.get('frd'))
-    rs = item.get('rs')
-    src = item.get('src')
-    rs = str(rs)[:64] if rs is not None else None
-    src = str(src)[:32] if src is not None else None
-    return (pmid, str(uid)[:128], first_retrieval, rs, src)
-
-
-def stream_into_staging(conn, cursor, dynamo_table):
-    """Scan ArticleProvenance and INSERT IGNORE each page into the staging table.
-    Streaming keeps peak memory to one page; INSERT IGNORE collapses any
-    (pmid, personIdentifier) duplicate (incl. case-variant uids under the
-    case-insensitive PK) instead of aborting. Returns scan/skip stats."""
-    col_list = ', '.join(f'`{c}`' for c in LOAD_COLUMNS)
-    placeholders = ', '.join(['%s'] * len(LOAD_COLUMNS))
-    sql = f"INSERT IGNORE INTO `{STAGING_TABLE}` ({col_list}) VALUES ({placeholders})"
-
-    scanned = skipped_no_uid = skipped_bad_pmid = 0
-    for page in scan_article_provenance(dynamo_table):
-        scanned += len(page)
-        page_rows = []
-        for item in page:
-            row = item_to_row(item)
-            if row is None:
-                skipped_no_uid += 1
-            elif row is False:
-                skipped_bad_pmid += 1
-            else:
-                page_rows.append(row)
-        if page_rows:
-            cursor.executemany(sql, page_rows)
-            conn.commit()
-
-    cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`")
-    staged = cursor.fetchone()['c']
-    skipped = skipped_no_uid + skipped_bad_pmid
-    logger.info('Scanned %d items; staged %d rows (skipped %d no-uid, %d bad-pmid).',
-                scanned, staged, skipped_no_uid, skipped_bad_pmid)
-    if scanned and (skipped / scanned) > SKIP_RATIO_WARN:
-        logger.warning('High skip ratio: %d/%d (%.1f%%) of scanned items were '
-                       'dropped; staged table may be partial.',
-                       skipped, scanned, 100.0 * skipped / scanned)
-    return {'scanned': scanned, 'staged': staged}
-
-
-def create_staging_table(cursor):
-    """(Re)create the staging table as an empty clone of production."""
-    cursor.execute(CREATE_TARGET_SQL)  # ensure production exists (fresh env)
-    cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`")
-    cursor.execute(f"CREATE TABLE `{STAGING_TABLE}` LIKE `{TARGET_TABLE}`")
-    logger.info('Created staging table %s', STAGING_TABLE)
-
-
-def recover_orphaned_backup(conn, cursor):
-    """Self-heal from a prior run that died after RENAMEing production away but
-    before the swap completed: if production is gone but a backup exists, restore
-    it so we never fabricate an empty production table over a good backup."""
-    cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'")
-    if cursor.fetchone():
-        return
-    cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'")
-    if cursor.fetchone():
-        logger.warning('Production %s missing but %s present (orphaned prior run); '
-                       'restoring backup before proceeding.', TARGET_TABLE, BACKUP_TABLE)
-        cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`")
-        conn.commit()
-
-
-def validate_staging(cursor, min_rows=MIN_STAGING_ROWS, min_percentage=80):
-    """Guard against replacing a healthy production table with a partial/empty
-    scan. Requires the staging table to meet a row floor and (when production
-    already has data) to be at least min_percentage of the production row count."""
-    cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`")
-    staging_count = cursor.fetchone()['c']
-    cursor.execute(f"SELECT COUNT(*) AS c FROM `{TARGET_TABLE}`")
-    production_count = cursor.fetchone()['c']
-    logger.info('Validation: %s has %d rows, %s has %d rows',
-                STAGING_TABLE, staging_count, TARGET_TABLE, production_count)
-
-    if staging_count < min_rows:
-        logger.error('Validation FAILED: %s has %d rows (minimum %d)',
-                     STAGING_TABLE, staging_count, min_rows)
-        return False
-    if production_count > 0:
-        percentage = (staging_count / production_count) * 100
-        logger.info('Staging is %.1f%% of production', percentage)
-        if percentage < min_percentage:
-            logger.error('Validation FAILED: staging (%d) is only %.1f%% of '
-                         'production (%d); minimum %d%%',
-                         staging_count, percentage, production_count, min_percentage)
-            return False
-    logger.info('Validation PASSED for %s', STAGING_TABLE)
-    return True
-
-
-def atomic_swap(conn, cursor):
-    """Atomically swap staging into production: production -> backup, staging ->
-    production, in a single RENAME TABLE (atomic in MariaDB/InnoDB)."""
-    cursor.execute(f"DROP TABLE IF EXISTS `{BACKUP_TABLE}`")
-    rename_sql = (f"RENAME TABLE `{TARGET_TABLE}` TO `{BACKUP_TABLE}`, "
-                  f"`{STAGING_TABLE}` TO `{TARGET_TABLE}`")
-    logger.info('Executing atomic swap: %s', rename_sql)
-    cursor.execute(rename_sql)
-    conn.commit()
-    logger.info('Atomic swap completed for %s', TARGET_TABLE)
-
-
-def restore_from_backup(conn, cursor):
-    """Rename the backup table back to production if a swap failed mid-flight.
-    Because the swap is a single atomic RENAME, a raised swap means NEITHER table
-    was renamed and the backup was already dropped -- so 'no backup found' here is
-    the expected, SAFE outcome (production was never moved), not a data-loss event."""
-    cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'")
-    if not cursor.fetchone():
-        logger.info('No backup table %s to restore (production left untouched).',
-                    BACKUP_TABLE)
-        return
-    cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'")
-    if cursor.fetchone():
-        cursor.execute(f"DROP TABLE IF EXISTS `{TARGET_TABLE}`")
-    cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`")
-    conn.commit()
-    logger.info('Restored %s from backup', TARGET_TABLE)
-
-
-def cleanup_staging(conn, cursor):
-    cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`")
-    conn.commit()
-    logger.info('Cleaned up staging table %s', STAGING_TABLE)
-
-
-def main():
-    conn = connect_db()
-    cursor = conn.cursor()
-
-    cfg = Config(retries={'max_attempts': 10, 'mode': 'standard'})
-    dynamo_table = boto3.resource('dynamodb', config=cfg).Table(DYNAMO_TABLE)
-
-    try:
-        recover_orphaned_backup(conn, cursor)
-        create_staging_table(cursor)
-        conn.commit()
-
-        stream_into_staging(conn, cursor, dynamo_table)
-
-        if not validate_staging(cursor):
-            logger.error('Validation failed; aborting swap to protect production.')
-            cleanup_staging(conn, cursor)
-            sys.exit(1)
-
-        try:
-            atomic_swap(conn, cursor)
-        except Exception as swap_err:
-            logger.error('Atomic swap failed: %s; attempting restore.', swap_err)
-            restore_from_backup(conn, cursor)
-            cleanup_staging(conn, cursor)
-            sys.exit(1)
-
-        logger.info('SUCCESS: %s updated with zero downtime.', TARGET_TABLE)
-
-    except (BotoCoreError, ClientError) as e:
-        logger.error('DynamoDB error during %s scan: %s', DYNAMO_TABLE, e)
-        cleanup_staging(conn, cursor)
-        sys.exit(1)
-    except pymysql.err.MySQLError as e:
-        logger.error('Database error: %s', e)
-        try:
-            cleanup_staging(conn, cursor)
-        except Exception:
-            pass
-        sys.exit(1)
-    finally:
-        cursor.close()
-        conn.close()
-        logger.info('Database connection closed.')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/update/run_all.py b/update/run_all.py
index 61713a1..6ebe245 100644
--- a/update/run_all.py
+++ b/update/run_all.py
@@ -109,30 +109,22 @@ def upload_log_to_s3():
 
 # ------------- Main Flow -------------
 def main():
-    # (name, command, non_fatal). non_fatal=True steps log a warning and continue
-    # on failure instead of aborting the pipeline.
     scripts = [
-        ("executeFeatureGenerator", "python3 executeFeatureGenerator.py", False),
-        ("retrieveArticles", "python3 retrieveArticles.py", False),
-        ("retrieveNIH", "python3 retrieveNIH.py", False),
-        ("retrieveReporter", "python3 retrieveReporter.py", False),
-        # article_provenance feeds a PM display field only; nothing downstream
-        # depends on it, so a failure here must not block nightly indexing.
-        ("retrieveArticleProvenance", "python3 retrieveArticleProvenance.py", True),
-        ("nightlyIndexing", "bash run_nightly_indexing.sh", False),
-        ("abstractImport", "python3 abstractImport.py", False),
-        ("conflictsImport", "python3 conflictsImport.py", False)
+        ("executeFeatureGenerator", "python3 executeFeatureGenerator.py"),
+        ("retrieveArticles", "python3 retrieveArticles.py"),
+        ("retrieveNIH", "python3 retrieveNIH.py"),
+        ("retrieveReporter", "python3 retrieveReporter.py"),
+        ("nightlyIndexing", "bash run_nightly_indexing.sh"),
+        ("abstractImport", "python3 abstractImport.py"),
+        ("conflictsImport", "python3 conflictsImport.py")
     ]
 
     overall_success = True
 
-    for name, cmd, non_fatal in scripts:
+    for name, cmd in scripts:
         #ok = run_script(name, cmd)
         ok = run_script(name, cmd, timeout_seconds=int(os.getenv("SCRIPT_TIMEOUT_SECONDS", "15000")))
         if not ok:
-            if non_fatal:
-                logger.warning(f"⚠️ NON-FATAL: {name} failed; continuing pipeline.")
-                continue
             overall_success = False
             logger.error("Stopping pipeline due to script failure.")
             break

From 408c4074b1bd188a720112d779652ffe2bb72cfd Mon Sep 17 00:00:00 2001
From: Paul Albert <palbert1@gmail.com>
Date: Wed, 17 Jun 2026 15:58:39 -0400
Subject: [PATCH 19/19] feat(setup): mirror RBAC permission tables +
 impersonatedByUserID column

3-places mirror of ReCiter-Publication-Manager schema:
- add admin_permissions, admin_role_permissions, admin_permission_resources
  to createDatabaseTableReciterDb.sql
- add impersonatedByUserID column to admin_feedback_log
- add table_admin_permissions.sql with the permission/role-mapping/nav seed

Mirrors PM scripts/migrations/add-permission-tables.sql and
add-impersonated-by-feedbacklog.sql. Refs #739, #733.
---
 setup/createDatabaseTableReciterDb.sql | 44 ++++++++++++++++++++
 setup/table_admin_permissions.sql      | 56 ++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)
 create mode 100644 setup/table_admin_permissions.sql

diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql
index 91c8641..ea3bd8e 100644
--- a/setup/createDatabaseTableReciterDb.sql
+++ b/setup/createDatabaseTableReciterDb.sql
@@ -48,6 +48,7 @@ CREATE TABLE IF NOT EXISTS `admin_feedback_log` (
   `personIdentifier` varchar(20) DEFAULT NULL,
   `articleIdentifier` int(11) DEFAULT NULL,
   `feedback` varchar(11) DEFAULT NULL,
+  `impersonatedByUserID` int(11) DEFAULT NULL,
   `createTimestamp` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00',
   `modifyTimestamp` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
   PRIMARY KEY (`feedbackID`),
@@ -96,6 +97,49 @@ CREATE TABLE IF NOT EXISTS `admin_roles` (
   PRIMARY KEY (`roleID`)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
 
+-- Data-driven RBAC permission tables.
+-- Mirror of ReCiter-Publication-Manager scripts/migrations/add-permission-tables.sql
+-- (3-places rule). Seed data lives in setup/table_admin_permissions.sql.
+CREATE TABLE IF NOT EXISTS `admin_permissions` (
+  `permissionID` int(11) NOT NULL AUTO_INCREMENT,
+  `permissionKey` varchar(128) NOT NULL,  -- stable lookup key, e.g. 'canCurate'; enforced unique by uq_permissionKey
+  `label` varchar(255) NOT NULL,  -- human-readable name, e.g. 'Curate Publications'
+  `description` text DEFAULT NULL,
+  `category` varchar(64) NOT NULL,  -- grouping label, e.g. 'Curation', 'Administration'
+  `createTimestamp` timestamp NOT NULL DEFAULT current_timestamp(),  -- set at insert time, as in the two sibling tables; the seed INSERT omits this column
+  `modifyTimestamp` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+  PRIMARY KEY (`permissionID`),
+  UNIQUE KEY `uq_permissionKey` (`permissionKey`)
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+CREATE TABLE IF NOT EXISTS `admin_role_permissions` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `roleID` int(11) NOT NULL,  -- references admin_roles.roleID (fk_rp_role)
+  `permissionID` int(11) NOT NULL,  -- references admin_permissions.permissionID (fk_rp_permission)
+  `createTimestamp` timestamp NOT NULL DEFAULT current_timestamp(),
+  PRIMARY KEY (`id`),
+  UNIQUE KEY `uq_role_permission` (`roleID`,`permissionID`),  -- at most one row per (role, permission) pair
+  KEY `idx_roleID` (`roleID`),  -- NOTE(review): same leading column as uq_role_permission, so this index looks redundant
+  KEY `idx_permissionID` (`permissionID`),
+  CONSTRAINT `fk_rp_role` FOREIGN KEY (`roleID`) REFERENCES `admin_roles` (`roleID`) ON DELETE CASCADE,  -- deleting a role deletes its mappings
+  CONSTRAINT `fk_rp_permission` FOREIGN KEY (`permissionID`) REFERENCES `admin_permissions` (`permissionID`) ON DELETE CASCADE  -- deleting a permission deletes its mappings
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
+CREATE TABLE IF NOT EXISTS `admin_permission_resources` (
+  `id` int(11) NOT NULL AUTO_INCREMENT,
+  `permissionID` int(11) NOT NULL,  -- permission this resource belongs to (fk_pr_permission)
+  `resourceType` varchar(32) NOT NULL,  -- the seed in table_admin_permissions.sql only uses 'nav'
+  `resourceKey` varchar(128) NOT NULL,  -- e.g. 'nav_search'; NOTE(review): no UNIQUE key, so the schema does not prevent duplicate rows
+  `displayOrder` int(11) NOT NULL DEFAULT 0,  -- the seed numbers the nav items 1..7
+  `icon` varchar(64) DEFAULT NULL,  -- e.g. 'Search', 'Settings'; presumably UI icon component names -- TODO confirm
+  `label` varchar(255) NOT NULL,
+  `route` varchar(255) DEFAULT NULL,  -- e.g. '/search'
+  `createTimestamp` timestamp NOT NULL DEFAULT current_timestamp(),
+  PRIMARY KEY (`id`),
+  KEY `idx_pr_permissionID` (`permissionID`),
+  CONSTRAINT `fk_pr_permission` FOREIGN KEY (`permissionID`) REFERENCES `admin_permissions` (`permissionID`) ON DELETE CASCADE  -- deleting a permission deletes its resources
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
+
 CREATE TABLE IF NOT EXISTS `admin_settings` (
   `viewName` varchar(200) NOT NULL,
   `viewAttributes` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_bin DEFAULT NULL CHECK (json_valid(`viewAttributes`)),
diff --git a/setup/table_admin_permissions.sql b/setup/table_admin_permissions.sql
new file mode 100644
index 0000000..c1d37af
--- /dev/null
+++ b/setup/table_admin_permissions.sql
@@ -0,0 +1,56 @@
+-- Seed data for the data-driven RBAC permission tables.
+-- Mirror of the SEED section of ReCiter-Publication-Manager
+-- scripts/migrations/add-permission-tables.sql (3-places rule).
+--
+-- Run ONCE per environment (the INSERTs are not idempotent; a re-run fails on
+-- uq_permissionKey), after createDatabaseTableReciterDb.sql has created the
+-- tables and table_admin_roles.sql has seeded admin_roles. The role->permission
+-- seed joins on admin_roles.roleLabel, so it adapts to whatever roles an
+-- environment defines; a role that does not exist (e.g. 'Curator_Scoped') is a no-op.
+
+-- 1. Permissions (7)
+INSERT INTO `admin_permissions` (`permissionKey`, `label`, `description`, `category`) VALUES
+  ('canCurate', 'Curate Publications', 'Accept or reject article suggestions for people', 'Curation'),
+  ('canSearch', 'Search Identities', 'Search and browse the identity directory', 'Navigation'),
+  ('canReport', 'Create Reports', 'Generate publication reports and export data', 'Reporting'),
+  ('canManageUsers', 'Manage Users', 'Create, edit, and deactivate user accounts and assign roles', 'Administration'),
+  ('canConfigure', 'Configuration', 'Edit application settings, labels, and field visibility', 'Administration'),
+  ('canManageNotifications', 'Manage Notifications', 'Configure notification preferences', 'Communication'),
+  ('canManageProfile', 'Manage Profile', 'View and edit user profile information', 'Profile');
+
+-- 2. Role -> permission mappings (reproduces current behavior)
+INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`)
+  SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap
+  WHERE ar.roleLabel = 'Superuser';                                                   -- all 7
+INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`)
+  SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap
+  WHERE ar.roleLabel = 'Curator_All'                 AND ap.permissionKey IN ('canCurate','canSearch');
+INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`)
+  SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap
+  WHERE ar.roleLabel = 'Curator_Self'                AND ap.permissionKey IN ('canCurate');
+INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`)
+  SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap
+  WHERE ar.roleLabel = 'Curator_Scoped'              AND ap.permissionKey IN ('canCurate','canSearch');
+INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`)
+  SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap
+  WHERE ar.roleLabel = 'Curator_Department'          AND ap.permissionKey IN ('canCurate','canSearch');
+INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`)
+  SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap
+  WHERE ar.roleLabel = 'Curator_Department_Delegate' AND ap.permissionKey IN ('canCurate','canSearch');
+INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`)
+  SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap
+  WHERE ar.roleLabel = 'Reporter_All'                AND ap.permissionKey IN ('canReport','canSearch');
+
+-- 3. Nav resources (sidebar items)
+INSERT INTO `admin_permission_resources` (`permissionID`, `resourceType`, `resourceKey`, `displayOrder`, `icon`, `label`, `route`)
+  SELECT ap.permissionID, v.resourceType, v.resourceKey, v.displayOrder, v.icon, v.label, v.route
+  FROM admin_permissions ap
+  JOIN (
+    SELECT 'canSearch' AS pk, 'nav' AS resourceType, 'nav_search' AS resourceKey, 1 AS displayOrder, 'Search' AS icon, 'Find People' AS label, '/search' AS route
+    UNION ALL SELECT 'canCurate', 'nav', 'nav_curate', 2, 'LocalLibrary', 'Curate Publications', '/curate'
+    UNION ALL SELECT 'canReport', 'nav', 'nav_report', 3, 'Assessment', 'Create Reports', '/report'
+    UNION ALL SELECT 'canManageNotifications', 'nav', 'nav_notifications', 4, 'NotificationsActive', 'Manage Notifications', '/notifications'
+    UNION ALL SELECT 'canManageProfile', 'nav', 'nav_profile', 5, 'AccountCircle', 'Manage Profile', '/manageprofile'
+    UNION ALL SELECT 'canManageUsers', 'nav', 'nav_users', 6, 'Group', 'Manage Users', '/manageusers'
+    UNION ALL SELECT 'canConfigure', 'nav', 'nav_config', 7, 'Settings', 'Configuration', '/configuration'
+  ) v ON ap.permissionKey = v.pk;