From 84fd32f79602d84069f07a4debb60aca5a085dbb Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Thu, 5 Mar 2026 12:09:20 -0500 Subject: [PATCH 01/19] Fix validation blocking table swap when production has duplicate pmids The analysis_nih table was loaded with ~527K rows (2x duplicates) on Dec 18, 2025. Since then, every nightly run retrieves the correct ~267K rows but validation rejects the swap (267K/527K = 50.7% < 80% threshold). Changes: - validate_data() now detects duplicate pmids in production and uses the unique count for percentage comparison, allowing the swap to self-heal - create_staging_tables() adds a UNIQUE constraint on pmid for analysis_nih_new to prevent future duplicate inserts - Added check_production_integrity() utility for diagnostic use - Schema: changed analysis_nih.idx_pmid from KEY to UNIQUE KEY Co-Authored-By: Claude Opus 4.6 --- setup/createDatabaseTableReciterDb.sql | 2 +- update/retrieveNIH.py | 37 +++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql index e94efd2..13d04d8 100644 --- a/setup/createDatabaseTableReciterDb.sql +++ b/setup/createDatabaseTableReciterDb.sql @@ -260,7 +260,7 @@ CREATE TABLE IF NOT EXISTS `analysis_nih` ( `x_coord` float(5,4) DEFAULT NULL, `y_coord` float(5,4) DEFAULT NULL, PRIMARY KEY (`id`), - KEY `idx_pmid` (`pmid`) USING BTREE + UNIQUE KEY `idx_pmid` (`pmid`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS `analysis_nih_cites` ( diff --git a/update/retrieveNIH.py b/update/retrieveNIH.py index ae7f01d..97b40bd 100644 --- a/update/retrieveNIH.py +++ b/update/retrieveNIH.py @@ -207,6 +207,14 @@ def create_staging_tables(mysql_cursor, tables): mysql_cursor.execute(f"CREATE TABLE {staging_table} LIKE {table_name}") mysql_cursor.execute(f"ALTER TABLE {staging_table} MODIFY COLUMN id int(11) NOT NULL AUTO_INCREMENT") logger.info(f"Created 
staging table: {staging_table}") + # Add unique constraint on pmid for analysis_nih_new to prevent duplicates + try: + mysql_cursor.execute( + "ALTER TABLE analysis_nih_new ADD UNIQUE KEY uk_pmid (pmid)" + ) + logger.info("Added UNIQUE constraint on pmid for analysis_nih_new") + except Exception as e: + logger.warning(f"Could not add UNIQUE constraint (may already exist): {e}") def atomic_table_swap(mysql_db, mysql_cursor, tables): """ @@ -274,6 +282,7 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m Validate staging table has sufficient data before swap. - Must have at least min_rows - Must have at least min_percentage of production table's row count + - Detects duplicate pmids in production (corruption) and uses unique count instead """ mysql_cursor.execute(f"SELECT COUNT(*) as cnt FROM {staging_table}") staging_count = mysql_cursor.fetchone()['cnt'] @@ -281,8 +290,19 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m mysql_cursor.execute(f"SELECT COUNT(*) as cnt FROM {production_table}") production_count = mysql_cursor.fetchone()['cnt'] + # Check for duplicates in production (corruption detection) + mysql_cursor.execute(f"SELECT COUNT(DISTINCT pmid) as cnt FROM {production_table}") + unique_production = mysql_cursor.fetchone()['cnt'] + logger.info(f"Validation: {staging_table} has {staging_count} rows, " - f"{production_table} has {production_count} rows") + f"{production_table} has {production_count} rows " + f"({unique_production} unique pmids)") + + if production_count != unique_production: + logger.warning(f"CORRUPTION DETECTED: {production_table} has " + f"{production_count - unique_production} duplicate rows. 
" + f"Using unique count ({unique_production}) for validation.") + production_count = unique_production # Use deduped count for comparison # Check minimum rows if staging_count < min_rows: @@ -303,6 +323,21 @@ def validate_data(mysql_cursor, staging_table, production_table, min_rows=100, m logger.info(f"Validation PASSED for {staging_table}") return True +def check_production_integrity(mysql_cursor, table_name, key_column='pmid'): + """Check if production table has duplicate key values (corruption indicator).""" + mysql_cursor.execute(f""" + SELECT COUNT(*) as total_rows, COUNT(DISTINCT {key_column}) as unique_keys + FROM {table_name} + """) + result = mysql_cursor.fetchone() + total = result['total_rows'] + unique = result['unique_keys'] + if total != unique: + logger.warning(f"INTEGRITY CHECK: {table_name} has {total} rows but only " + f"{unique} unique {key_column} values ({total - unique} duplicates)") + return False + return True + ######### if __name__ == '__main__': From 4605fc2945355d5393cc198e2923e64ccbbb1664 Mon Sep 17 00:00:00 2001 From: paulalbert1 Date: Mon, 27 Mar 2023 08:59:27 -0400 Subject: [PATCH 02/19] Updated journal impact score inference --- setup/insertBaselineDataReciterDb.sql | 353 -------------------------- setup/table_admin_roles.sql | 9 + update/temp/.gitkeep | 0 3 files changed, 9 insertions(+), 353 deletions(-) delete mode 100644 setup/insertBaselineDataReciterDb.sql create mode 100644 setup/table_admin_roles.sql delete mode 100644 update/temp/.gitkeep diff --git a/setup/insertBaselineDataReciterDb.sql b/setup/insertBaselineDataReciterDb.sql deleted file mode 100644 index 081b1c3..0000000 --- a/setup/insertBaselineDataReciterDb.sql +++ /dev/null @@ -1,353 +0,0 @@ -SET FOREIGN_KEY_CHECKS = 0; -TRUNCATE `admin_roles`; - -LOCK TABLES `admin_roles` WRITE; -INSERT INTO `admin_roles` (`roleID`, `roleLabel`) VALUES - (1,'Superuser'), - (2,'Curator_All'), - (3,'Reporter_All'), - (4,'Curator_Self'), - (5,'Curator_Department'), - 
(6,'Curator_Department_Delegate'); -UNLOCK TABLES; - -TRUNCATE `analysis_special_characters`; - -LOCK TABLES `analysis_special_characters` WRITE; -TRUNCATE `analysis_special_characters`; -INSERT INTO `analysis_special_characters` (`id`, `specialCharacter`, `RTFescape`, `characterName`) VALUES - (1,'͵','\\\'82','Low left single quote'), - (2,'ƒ','\\\'83','Florin'), - (3,'„','\\\'84','Low left double quote'), - (4,'…','\\\'85','Ellipsis'), - (5,'†','\\\'86','Dagger'), - (6,'‡','\\\'87','Double dagger'), - (7,'∘','\\\'88','Circumflex'), - (8,'‰','\\\'89','Permil'), - (9,'Š','\\\'8a','S-caron'), - (10,'‹','\\\'8b','Single left guillemet'), - (11,'Œ','\\\'8c','OE-ligature'), - (12,'Ž','\\\'8e','Z-caron'), - (13,'‘','\\\'91','Left single quote'), - (14,'’','\\\'92','Right single quote'), - (15,'“','\\\'93','Left double quote'), - (16,'”','\\\'94','Right double quote'), - (17,'•','\\\'95','Bullet'), - (18,'–','\\\'96','En dash'), - (19,'—','\\\'97','Em dash'), - (20,'~','\\\'98','Tilde'), - (21,'™','\\\'99','Trademark'), - (22,'š','\\\'9a','s-caron'), - (23,'›','\\\'9b','Single right guillemet'), - (24,'œ','\\\'9c','oe ligature'), - (25,'ž','\\\'9e','z-caron'), - (26,'Ÿ','\\\'9f','Y-diaeresis'), - (27,'¡','\\\'a1','Inverted exclamation point'), - (28,'¢','\\\'a2','Cent sign'), - (29,'£','\\\'a3','Pound sign'), - (30,'¤','\\\'a4','General currency sign'), - (31,'¥','\\\'a5','Yen sign'), - (32,'¦','\\\'a6','Broken vertical bar'), - (33,'§','\\\'a7','Section sign'), - (34,'¨','\\\'a8','Spacing diaeresis'), - (35,'©','\\\'a9','Copyright'), - (36,'ª','\\\'aa','Feminine ordinal'), - (37,'«','\\\'ab','Left angle quotes'), - (38,'¬','\\\'ac','Not sign'), - (39,'(-)','\\-','Soft hyphen'), - (40,'®','\\\'ae','Registered trademark'), - (41,'¯','\\\'af','Macron accent'), - (42,'°','\\\'b0','Degree sign'), - (43,'±','\\\'b1','Plus or minus sign'), - (44,'²','\\\'b2','Superscript 2'), - (45,'³','\\\'b3','Superscript 3'), - (46,'´','\\\'b4','Acute accent'), - (47,'µ','\\\'b5','Micro 
sign (Greek mu)'), - (48,'¶','\\\'b6','Paragraph sign'), - (49,'·','\\\'b7','Middle dot'), - (50,'¸','\\\'b8','Cedilla'), - (51,'¹','\\\'b9','Superscript 1'), - (52,'º','\\\'ba','Masculine ordinal'), - (53,'»','\\\'bb','Right angle quotes'), - (54,'¼','\\\'bc','One-fourth fraction'), - (55,'½','\\\'bd','One-half fraction'), - (56,'¾','\\\'be','Three-fourths fraction'), - (57,'¿','\\\'bf','Inverted question mark'), - (58,'À','\\\'c0','A-grave'), - (59,'Á','\\\'c1','A-acute'), - (60,'Â','\\\'c2','A-circumflex'), - (61,'Ã','\\\'c3','A-tilde'), - (62,'Ä','\\\'c4','A-diaeresis'), - (63,'Å','\\\'c5','A-ring'), - (64,'Æ','\\\'c6','AE-ligature'), - (65,'Ç','\\\'c7','C-cedilla'), - (66,'È','\\\'c8','E-grave'), - (67,'É','\\\'c9','E-acute'), - (68,'Ê','\\\'ca','E-circumflex'), - (69,'Ë','\\\'cb','E-diaeresis'), - (70,'Ì','\\\'cc','I-grave'), - (71,'Í','\\\'cd','I-acute'), - (72,'Î','\\\'ce','I-circumflex'), - (73,'Ï','\\\'cf','I-diaeresis'), - (74,'Ð','\\\'d0','Uppercase edh'), - (75,'Ñ','\\\'d1','N-tilde'), - (76,'Ò','\\\'d2','O-grave'), - (77,'Ó','\\\'d3','O-acute'), - (78,'Ô','\\\'d4','O-circumflex'), - (79,'Õ','\\\'d5','O-tilde'), - (80,'Ö','\\\'d6','O-diaeresis'), - (81,'×','\\\'d7','Multiply sign'), - (82,'Ø','\\\'d8','O-slash'), - (83,'Ù','\\\'d9','U-grave'), - (84,'Ú','\\\'da','U-acute'), - (85,'Û','\\\'db','U-circumflex'), - (86,'Ü','\\\'dc','U-diaeresis'), - (87,'Ý','\\\'dd','Y-acute'), - (88,'Þ','\\\'de','Uppercase thorn'), - (89,'ß','\\\'df','German ess-zed'), - (90,'à','\\\'e0','a-grave'), - (91,'á','\\\'e1','a-acute'), - (92,'â','\\\'e2','a-circumflex'), - (93,'ã','\\\'e3','a-tilde'), - (94,'ä','\\\'e4','a-diaeresis'), - (95,'å','\\\'e5','a-ring'), - (96,'æ','\\\'e6','ae-ligature'), - (97,'ç','\\\'e7','c-cedilla'), - (98,'è','\\\'e8','e-grave'), - (99,'é','\\\'e9','e-acute'), - (100,'ê','\\\'ea','e-circumflex'), - (101,'ë','\\\'eb','e-diaeresis'), - (102,'ì','\\\'ec','i-grave'), - (103,'í','\\\'ed','i-acute'), - (104,'î','\\\'ee','i-circumflex'), - 
(105,'ï','\\\'ef','i-diaeresis'), - (106,'ð','\\\'f0','Lowercase edh'), - (107,'ñ','\\\'f1','n-tilde'), - (108,'ò','\\\'f2','o-grave'), - (109,'ó','\\\'f3','o-acute'), - (110,'ô','\\\'f4','o-circumflex'), - (111,'õ','\\\'f5','o-tilde'), - (112,'ö','\\\'f6','o-diaeresis'), - (113,'÷','\\\'f7','Division sign'), - (114,'ø','\\\'f8','o-slash'), - (115,'ù','\\\'f9','u-grave'), - (116,'ú','\\\'fa','u-acute'), - (117,'û','\\\'fb','u-circumflex'), - (118,'ü','\\\'fc','u-diaeresis'), - (119,'ý','\\\'fd','y-acute'), - (120,'þ','\\\'fe','Lowercase thorn'), - (121,'ÿ','\\\'ff','y-diaeresis'), - (122,'č','\\u269 ',NULL), - (123,'ć','\\u263 ',NULL), - (124,'β','\\u946 ','beta'), - (125,'Α','\\u913 ','Alpha'), - (126,'Β','\\u914 ','Beta'), - (127,'Γ','\\u915 ','Gamma'), - (128,'Δ','\\u916 ','Delta'), - (129,'Ε','\\u917 ','Epsilon'), - (130,'Ζ','\\u918 ','Zeta'), - (131,'Η','\\u919 ','Eta'), - (132,'Θ','\\u920 ','Theta'), - (133,'Ι','\\u921 ','Iota'), - (134,'Κ','\\u922 ','Kappa'), - (135,'Λ','\\u923 ','Lambda'), - (136,'Μ','\\u924 ','Mu'), - (137,'Ν','\\u925 ','Nu'), - (138,'Ξ','\\u926 ','Xi'), - (139,'Ο','\\u927 ','Omicron'), - (140,'Π','\\u928 ','Pi'), - (141,'Ρ','\\u929 ','Rho'), - (142,'Σ','\\u931 ','Sigma'), - (143,'Τ','\\u932 ','Tau'), - (144,'Υ','\\u933 ','Upsilon'), - (145,'Φ','\\u934 ','Phi'), - (146,'Χ','\\u935 ','Chi'), - (147,'Ψ','\\u936 ','Psi'), - (148,'Ω','\\u937 ','Omega'), - (149,'α','\\u945 ','Alpha'), - (150,'β','\\u946 ','Beta'), - (151,'γ','\\u947 ','Gamma'), - (152,'δ','\\u948 ','Delta'), - (153,'ε','\\u949 ','Epsilon'), - (154,'ζ','\\u950 ','Zeta'), - (155,'η','\\u951 ','Eta'), - (156,'θ','\\u952 ','Theta'), - (157,'ι','\\u953 ','Iota'), - (158,'κ','\\u954 ','Kappa'), - (159,'λ','\\u955 ','Lambda'), - (160,'μ','\\u956 ','Mu'), - (161,'ν','\\u957 ','Nu'), - (162,'ξ','\\u958 ','Xi'), - (163,'ο','\\u959 ','Omicron'), - (164,'π','\\u960 ','Pi'), - (165,'ρ','\\u961 ','Rho'), - (166,'σ','\\u963 ','Sigma'), - (167,'ς','\\u962 ','Sigma'), - (168,'τ','\\u964 
','Tau'), - (169,'υ','\\u965 ','Upsilon'), - (170,'φ','\\u966 ','Phi'), - (171,'χ','\\u967 ','Chi'), - (172,'ψ','\\u968 ','Psi'), - (173,'ω','\\u969 ','Omega'), - (174,'®','\\\'ae','reserved'), - (175,'ü','\\\'fc','u umlaut'), - (176,'ö','\\\'f6','o umlaut'), - (177,'é','\\\'e9','accented e'), - (178,'ç','\\\'e7','french c'), - (179,'…','\\\'85','ellipsis'), - (180,'ó','\\\'f3','accented o'), - (181,'™','\\\'99','trademark'), - (182,'≤','\\u8804 ','less than or equal to'), - (183,'≥','\\u8805 ','greater than or equal to'), - (184,'à','\\\'e0','accented a'), - (185,'ï','\\\'ef','i umlaut'), - (186,'—','\\\'97','long dash'), - (187,'→','\\u8594 ','right arrow'), - (188,'←','\\u8592 ','left arrow'), - (189,'°','\\\'b0','degree'), - (190,'á','\\\'e1','accented a'), - (191,'†','\\\'86','cross'), - (192,'è','\\\'e8','accented e'), - (193,'ê','\\\'ea','weird e'), - (194,'ã','\\\'e3','a with tilde'), - (195,'ß','\\\'df','beta'), - (196,'ū','\\u363 ','u with tilde'), - (197,'‡','\\\'87','double dagger'), - (198,'©','\\\'a9','copyright'), - (199,'∆','\\u8710 ','delta'), - (200,'í','\\\'ed','accented i'), - (201,'’','\\\'92','apostrophe'), - (202,'ë','\\\'eb','e with umlaut'), - (203,'ñ','\\\'f1','n with tilde'), - (204,'±','\\\'b1','plus or minus'), - (205,'”','\\\'94','double quotes'), - (206,'×','\\\'d7','x'), - (207,'Å','\\\'c5','a with circle'), - (208,'↔','\\u8596 ','double arrow'), - (209,'ä','\\u228 ','a with umlaut'), - (210,'“','\\\'81\\\'67','double quotes'), - (211,'•','\\u8226 ','bullet'), - (212,'∗','\\u8727 ','star'), - (213,'{','\\{','left brace'), - (214,'}','\\}','right brace'), - (215,'¹','\\\'b9','superscript 1'), - (216,'²','\\\'b2','superscript 2'), - (217,'³','\\\'b3','superscript 3'), - (218,'⁴','\\u8308 ','superscript 4'), - (219,'⁵','\\u8309 ','superscript 5'), - (220,'⁶','\\u8310 ','superscript 6'), - (221,'⁷','\\u8311 ','superscript 7'), - (222,'⁸','\\u8312 ','superscript 8'), - (223,'⁹','\\u8313 ','superscript 9'), - (224,'⁰','\\u8304 
','superscript 0'), - (225,'₁','\\u8321 ','subscript 1'), - (226,'₂','\\u8322 ','subscript 2'), - (227,'₃','\\u8323 ','subscript 3'), - (228,'₄','\\u8324 ','subscript 4'), - (229,'₅','\\u8325 ','subscript 5'), - (230,'₆','\\u8326 ','subscript 6'), - (231,'₇','\\u8327 ','subscript 7'), - (232,'₈','\\u8328 ','subscript 8'), - (233,'₉','\\u8329 ','subscript 9'), - (234,'₀','\\u8320 ','subscript 0'), - (235,'~','\\u8764 ','tilde'), - (236,'⁺','\\u8314 ','superscript plus'), - (237,'✰','\\u10032 ','star'), - (238,'·','\\\'b7','dot'), - (239,'–','\\\'96','dash'), - (240,'∩','\\u8745 ','intersection'), - (241,'‑','\\u8209 ','dash'), - (242,'☆','\\u9734 ','star'), - (243,'ɛ','\\u603 ','backwards 3'), - (244,'ô','\\\'f4','o with hat'), - (245,'fi','\\u64257 ','fi or something'), - (246,'ĸ','\\u312 ','k or something'), - (247,'ń','\\u324 ','accented n'), - (248,'″','\\u8243 ','quote'), - (249,'⁻','\\u8315 ','dash or something'), - (250,'‒','\\u8210 ','dash or something'), - (251,'ů','\\u367 ','u circle'), - (252,'√','\\u8730 ','checkmark'), - (253,'‘','\\\'91','apostrophe'), - (254,'ø','\\\'f8','o with slash'), - (255,'ú','\\\'fa','accented u'), - (256,'č','\\u269 ','c with caret'), - (257,'ć','\\u263 ','c with accent'), - (258,'ğ','\\u287 ','g with caret'), - (259,'ā','\\u257 ','a with line on top'), - (260,'õ','\\\'f5','o with tilde'), - (261,'ś','\\u347 ','accented s'), - (262,'î','\\\'ee','i with caret on top'), - (263,'ş','\\u351 ','s with squiggle'), - (264,'Ş','\\u350 ','capital s with squiggle'), - (265,'ʼ','\\u700 ','apostrophe'), - (266,'â','\\\'e2','a with caret'), - (267,'ı','\\u305 ','little i'), - (268,'ė','\\u279 ','e with dot'), - (269,'ł','\\u322 ','I with slash'), - (270,'ą','\\u261 ','a with squiggle'), - (271,'ę','\\u281 ','french e'), - (272,'ĭ','\\u301 ','i with half circle'), - (273,'ň','\\u328 ','n with caret'), - (274,'İ','\\u304 ','i with dot on top'), - (275,'ě','\\u283 ','e with caret'), - (276,'ǧ','\\u487 ','g with caret'), - (277,'ő','\\u337 
','o with two accents'), - (278,'û','\\\'fb','u with caret'), - (279,'ý','\\\'fd','y with accent'), - (280,'ź','\\u378 ','z with accent'), - (281,'ż','\\u380 ','z with dot'), - (282,'ű','\\u369 ','u with two accents'), - (283,'ŏ','\\u335 ','o with half circle'), - (284,'ī','\\u299 ','i with line on top'), - (285,'ӧ','\\u1255 ','o with umlaut'), - (286,'Đ','\\u272 ','d with slash'), - (287,'ř','\\u345 ','r with caret'), - (288,'ˇ','\\u711 ','caret'), - (289,'ă','\\u259 ','a with caret'), - (290,'ŕ','\\u341 ','r with accent'), - (291,'ĕ','\\u277 ','e with caret'), - (292,'ό','\\u972 ','o with accent'), - (293,'ũ','\\u361 ','u with tilde'), - (294,'׳','\\\'d7','apostrophe'); -UNLOCK TABLES; - -LOCK TABLES `admin_settings` WRITE; -TRUNCATE `admin_settings`; -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('displayMessages', '[{"labelUserKey":"messages","helpTextSettingsView":"Controls the displying of the success or error messages","isVisible":true}]', 'Display Messages'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('EmailNotifications', '[{"labelUserKey":"emailNotifications","labelSettingsView":"Enable email notifications","helpTextSettingsView":"Check to enable for all users. Once enabled, users with the \\"curator_self role\\" and a registered email will have the option to \\"Manage notifications.\\"","isVisible":true},{"labelUserKey":"emailSender","labelSettingsView":"Email sender","helpTextSettingsView":"Specify the \\"from\\" email address for the notifications. 
You can use either of these formats \\"publications@med.cornell.edu\\" or \\"WCM Publications \\"","labelUserView":"publications@med.cornell.edu"},{"labelUserKey":"emailSalutation","labelSettingsView":"Email body: salutation","labelUserView":"","helpTextSettingsView":"This text is the greeting portion of the email notification."},{"labelUserKey":"acceptedSubjectHeader","labelSettingsView":"Email body: \\"Accepted\\" section prefix","helpTextSettingsView":"This text precedes the list of any publications that have been accepted on behalf of a given person.","labelUserView":"The following publications have been accepted on your behalf"},{"labelUserKey":"suggestedSubjectHeader","labelSettingsView":"Email body: \\"Suggested\\" section prefix","helpTextSettingsView":"This text precedes the list of any publications that have been suggested for a given person.","labelUserView":"The following publications are pending for you"},{"labelUserKey":"acceptedEmailNotificationsLimit","labelSettingsView":"Email body: max accepted articles","helpTextSettingsView":"Select the maximum number of accepted publications to display in an email. We recommend a limit of 10. Note that this section excludes publications that have been accepted by the user themselves.","maxLimit":"5"},{"labelUserKey":"suggestedEmailNotificationsLimit","labelSettingsView":"Email body: max suggested articles","helpTextSettingsView":"Select the maximum number of suggested publications to display in an email. We recommend a limit of 10.","maxLimit":"10"},{"labelUserKey":"emailSignature","labelUserView":"Sincerely,\\nSamuel J. 
Wood Library\\nWeill Cornell Medicine\\n","labelSettingsView":"Email signature","helpTextSettingsView":"Define the signature that will appear at the end of the email."},{"labelUserKey":"testemailfunctionality","labelSettingsView":"Test emailing functionality","helpTextSettingsView":"Here you can test the email user functionality by inputting a person identifier, an email address recipient, and then clicking on \\"Send test email\\". If the Email Override field is blank, users will be contacted at their email of record as stored in the admin_users table.If the Email Override checkbox is selected, all notification emails from any regularly scheduled job will be sent to the email address specified in this field.","personIdentifier":"acs2001","emailOverride":"reciter2024@med.cornell.edu","useEmailForScheduledJobs":true,"submitButton":""}]', 'Email Notifications'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('findPeople', '[{"labelUserKey":"personIdentifier","labelUserView":"CWID","labelSettingsView":"Label for person identifier","helpTextSettingsView":"e.g.,NetID, CWID. Used throughout the application. 
"},{"labelUserKey":"organization","labelUserView":"Organization(s)","labelSettingsView":"Label for organizational unit","helpTextSettingsView":"e.g., Pediatrics"},{"labelUserKey":"institution","labelUserView":"Institution(s)","labelSettingsView":"Label for institution","helpTextSettingsView":"e.g., Cornell University"},{"labelUserKey":"personType","labelUserView":"Person Type(s)","labelSettingsView":"Label for person type","helpTextSettingsView":"e.g., academic-faculty,student-phd"}]', 'Find People'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('headshot', '[{"labelUserKey":"headshot","labelUserView":"Headshot","labelSettingsView":"Headshot","helpTextSettingsView":"Include the full URL for a third party headshot API where a token a personIdentifier is enclosed by braces","syntax":"https://directory.weill.cornell.edu/api/v1/person/profile/{personIdentifier}.png?returnGenericOn404=false"}]', 'Headshot'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingArticleCSV', '[{"labelUserKey":"reportingArticleCSVLimit","labelSettingsView":"Maximum records for export to article CSV","helpTextSettingsView":"","maxLimit":"500000"},{"labelUserKey":"pmid","labelUserView":"PMID","labelSettingsView":"PMID","helpTextSettingsView":"","displayRank":"21","isVisible":true},{"labelUserKey":"Article title","labelUserView":"Article title","labelSettingsView":"Article title","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"Article year","labelUserView":"Article year","labelSettingsView":"Article year","helpTextSettingsView":"","displayRank":"3","isVisible":true},{"labelUserKey":"pmcid","labelUserView":"PMCID","labelSettingsView":"PMCID","helpTextSettingsView":"","displayRank":4,"isVisible":true},{"labelUserKey":"Publication date display","labelUserView":"Publication date display","labelSettingsView":"Publication date 
display","helpTextSettingsView":"","displayRank":5,"isVisible":true},{"labelUserKey":"Date standardized","labelUserView":"Publication date standardized","labelSettingsView":"Publication date standardized","helpTextSettingsView":"","displayRank":6,"isVisible":true},{"labelUserKey":"Date added","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":7,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal Title Verbose","labelSettingsView":"Journal Title Verbose","helpTextSettingsView":"","displayRank":8,"isVisible":true},{"labelUserKey":"doi","labelUserView":"DOI","labelSettingsView":"DOI","helpTextSettingsView":"","displayRank":9,"isVisible":true},{"labelUserKey":"Issue","labelUserView":"Issue","labelSettingsView":"Issue","helpTextSettingsView":"","displayRank":10,"isVisible":true},{"labelUserKey":"Pages","labelUserView":"Pages","labelSettingsView":"Pages","helpTextSettingsView":"","displayRank":11,"isVisible":true},{"labelUserKey":"Volume","labelUserView":"Volume","labelSettingsView":"Volume","helpTextSettingsView":"","displayRank":12,"isVisible":true},{"labelUserKey":"Scimago Journal Rank","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","displayRank":13,"isVisible":true},{"labelUserKey":"Mendeley readers","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","displayRank":14,"isVisible":true},{"labelUserKey":"NIH Relative Citation Ratio","labelUserView":"NIH Relative Citation Ratio","labelSettingsView":"NIH Relative Citation Ratio","helpTextSettingsView":"","displayRank":15,"isVisible":true},{"labelUserKey":"NIH Percentile Rank","labelUserView":"NIH Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","displayRank":16,"isVisible":true},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs 
score","helpTextSettingsView":"","displayRank":17,"isVisible":false},{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","helpTextSettingsView":"","displayRank":18,"isVisible":true},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextSettingsView":"","displayRank":19,"isVisible":false},{"labelUserKey":"Authors","labelUserView":"Author(s)","labelSettingsView":"Author(s)","helpTextSettingsView":"","displayRank":"1","isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"An additional journal-level metric","displayRank":"20","isVisible":true},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author Position","helpTextSettingsView":"Author Position","displayRank":"21","isVisible":true}]', 'Reporting Article CSV'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingArticleRTF', '[{"labelUserKey":"reportingArticleRTFLimit","labelSettingsView":"Maximum records for export to article RTF","helpTextSettingsView":"Maximum number of article records a user can export to RTF. In testing, we have found the export fails after 40,000 records. 
","maxLimit":"1000","isValidate":false,"errorMessage":"Limit cannot exceed 40000"}]', 'Reporting Article RTF'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingAuthorshipCSV', '[{"labelUserKey":"reportingAuthorshipCSVLimit","labelSettingsView":"Maximum records for export to article CSV","helpTextSettingsView":"","maxLimit":"500000"},{"labelUserKey":"personIdentifier","labelUserView":"PersonIdentifier","labelSettingsView":"CWID","helpTextSettingsView":"","displayRank":"1","isVisible":false},{"labelUserKey":"Last Name","labelUserView":"Last Name","labelSettingsView":"Last Name","helpTextSettingsView":"","displayRank":"2","isVisible":false},{"labelUserKey":"First Name","labelUserView":"First Name","labelSettingsView":"First Name","helpTextSettingsView":"","displayRank":"3","isVisible":false},{"labelUserKey":"Organization","labelUserView":"Organization","labelSettingsView":"Organization","helpTextSettingsView":"","displayRank":"4","isVisible":true},{"labelUserKey":"Institution","labelUserView":"Institution","labelSettingsView":"Institution","helpTextSettingsView":"","displayRank":"5","isVisible":true},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author Position","helpTextSettingsView":"Author Position","displayRank":"6","isVisible":true},{"labelUserKey":"pmid","labelUserView":"PMID","labelSettingsView":"PMID","helpTextSettingsView":"","displayRank":7,"isVisible":true},{"labelUserKey":"Article title","labelUserView":"Article title","labelSettingsView":"Article title","helpTextSettingsView":"","displayRank":8,"isVisible":true},{"labelUserKey":"Article year","labelUserView":"Article year","labelSettingsView":"Article year","helpTextSettingsView":"","displayRank":9,"isVisible":true},{"labelUserKey":"pmcid","labelUserView":"PMCID","labelSettingsView":"PMCID","helpTextSettingsView":"","displayRank":10,"isVisible":true},{"labelUserKey":"Publication date 
display","labelUserView":"Publication date display","labelSettingsView":"Publication date display","helpTextSettingsView":"","displayRank":11,"isVisible":true},{"labelUserKey":"Date standardized","labelUserView":"Date standardized","labelSettingsView":"Publication date standardized","helpTextSettingsView":"","displayRank":12,"isVisible":true},{"labelUserKey":"Date added","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":13,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal title","labelSettingsView":"Journal title","helpTextSettingsView":"","displayRank":14,"isVisible":true},{"labelUserKey":"doi","labelUserView":"DOI","labelSettingsView":"DOI","helpTextSettingsView":"","displayRank":15,"isVisible":true},{"labelUserKey":"Issue","labelUserView":"Issue","labelSettingsView":"Issue","helpTextSettingsView":"","displayRank":16,"isVisible":true},{"labelUserKey":"Pages","labelUserView":"Pages","labelSettingsView":"Pages","helpTextSettingsView":"","displayRank":17,"isVisible":true},{"labelUserKey":"Volume","labelUserView":"Volume","labelSettingsView":"Volume","helpTextSettingsView":"","displayRank":18,"isVisible":true},{"labelUserKey":"Scimago Journal Rank","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","displayRank":19,"isVisible":true},{"labelUserKey":"Mendeley readers","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","displayRank":20,"isVisible":true},{"labelUserKey":"NIH Relative Citation Ratio","labelUserView":"NIH Relative Citation Ratio","labelSettingsView":"NIH Relative Citation Ratio","helpTextSettingsView":"","displayRank":21,"isVisible":true},{"labelUserKey":"NIH Percentile Rank","labelUserView":"NIH Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","displayRank":22,"isVisible":true},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs 
score","labelSettingsView":"TrendingPubs score","helpTextSettingsView":"","displayRank":23,"isVisible":false},{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH) ","helpTextSettingsView":"","displayRank":24,"isVisible":true},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus) ","labelSettingsView":"Citation count (Scopus) ","helpTextSettingsView":"","displayRank":25,"isVisible":false},{"labelUserKey":"Person types","labelUserView":"Person type(s)","labelSettingsView":"Person type(s)","helpTextSettingsView":"","displayRank":26,"isVisible":true},{"labelUserKey":"Authors","labelUserView":"Author(s)","labelSettingsView":"Author(s)","helpTextSettingsView":"","displayRank":27,"isVisible":true},{"labelUserKey":"Journal","labelUserView":"Journal Title Verbose","labelSettingsView":"Journal Title Verbose","helpTextSettingsView":"","displayRank":28,"isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"An additional journal-level metric","displayRank":"20","isVisible":true}]', 'Reporting Authorship CSV'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingFilters', '[{"labelUserKey":"Author","labelUserView":"Author","labelSettingsView":"Author","helpTextSettingsView":""},{"labelUserKey":"Organization","labelUserView":"Organization","labelSettingsView":"Organization","helpTextSettingsView":""},{"labelUserKey":"Institution","labelUserView":"Institution","labelSettingsView":"Institution","helpTextSettingsView":""},{"labelUserKey":"Person Type","labelUserView":"Person Type","labelSettingsView":"Person Type","helpTextSettingsView":""},{"labelUserKey":"Author Position","labelUserView":"Author Position","labelSettingsView":"Author 
Position","helpTextSettingsView":""},{"labelUserKey":"Date","labelUserView":"Date","labelSettingsView":"Date","helpTextSettingsView":""},{"labelUserKey":"Type","labelUserView":"Type","labelSettingsView":"Type","helpTextSettingsView":""},{"labelUserKey":"Journal","labelUserView":"Journal","labelSettingsView":"Journal","helpTextSettingsView":""},{"labelUserKey":"Journal Rank","labelUserView":"Journal Rank","labelSettingsView":"Journal Rank","helpTextSettingsView":""}]', 'Reporting Filters'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingWebDisplay', '[{"labelUserKey":"Citation count (NIH)","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","helpTextUserView":"This is the test number of citations an article has received from CrossRef, MEDLINE, PubMed Central, and Entrez. NIH citation counts generally correlate closely with the counts in Scopus and Web of Knowledge. ","helpTextSettingsView":"","displayRank":"3","isVisible":false},{"labelUserKey":"Citation count (Scopus)","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextUserView":"","helpTextSettingsView":"","displayRank":"1","isVisible":true},{"labelUserKey":"Percentile Rank","labelUserView":"Percentile Rank","labelSettingsView":"NIH Percentile Rank","helpTextUserView":"NIH percentile is the value of RCR provided as a percentile in which 100 is the highest and 0 is the lowest. For example, if an article has an NIH percentile of 63.2, it has received more citations than 631 articles when measured against a field and time-weighted benchmark of 1,000 NIH-funded research articles from the same year. 
A percentile is not computed for an article published in the past two years.","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"Relative Citation Ratio (NIH)","labelUserView":"Relative Citation Ratio (NIH)","labelSettingsView":"Relative Citation Ratio (NIH)","helpTextSettingsView":"","helpTextUserView":"Relative Citation Ratio (RCR) is the ratio between the number of times an article was cited in comparison to publications of the same date and field (as inferred by co-citation networks). A value of 1.0 is the median. Higher is better. The benchmark consists of research articles that are the product of R01 grants, the NIH''''s most prestigious and competitive funding mechanism.","displayRank":4,"isVisible":true},{"labelUserKey":"Journal Rank","labelUserView":"Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextUserView":"SCImago Journal Rank is a measure of the relative number of inbound citations articles in a given journal receive compared to outbound citations. 
It is closely correlated with Journal Impact Factor.","helpTextSettingsView":"Journal Rank Help text","displayRank":"5","isVisible":true},{"labelUserKey":"Journal Metric","labelUserView":"Journal Impact","labelSettingsView":"Journal Metric","helpTextUserView":"","helpTextSettingsView":"","displayRank":6,"isVisible":true},{"labelUserKey":"readersMendeley","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextUserView":"","helpTextSettingsView":"","displayRank":7,"isVisible":false},{"labelUserKey":"TrendingPubs score","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextUserView":"","helpTextSettingsView":"","displayRank":8,"isVisible":false}]', 'Reporting Web Display'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('reportingWebViewSort', '[{"labelUserKey":"datePublicationAddedToEntrez","labelUserView":"Date added","labelSettingsView":"Date added","helpTextSettingsView":"","displayRank":"2","isVisible":true},{"labelUserKey":"citationCountNIH","labelUserView":"Citation count (NIH)","labelSettingsView":"Citation count (NIH)","displayRank":"1","helpTextSettingsView":"","isVisible":true},{"labelUserKey":"citationCountScopus","labelUserView":"Citation count (Scopus)","labelSettingsView":"Citation count (Scopus)","helpTextSettingsView":"","displayRank":"3","isVisible":true},{"labelUserKey":"journalImpactScore1","labelUserView":"Scimago Journal Rank","labelSettingsView":"Scimago Journal Rank","helpTextSettingsView":"","isVisible":true,"displayRank":"4"},{"labelUserKey":"journalImpactScore2","labelUserView":"Journal Metric","labelSettingsView":"Journal Metric","helpTextSettingsView":"","displayRank":"5","isVisible":true},{"labelUserKey":"readersMendeley","labelUserView":"Mendeley readers","labelSettingsView":"Mendeley readers","helpTextSettingsView":"","isVisible":false,"displayRank":"6"},{"labelUserKey":"percentileNIH","labelUserView":"NIH Percentile 
Rank","labelSettingsView":"NIH Percentile Rank","helpTextSettingsView":"","isVisible":true,"displayRank":"7"},{"labelUserKey":"relativeCitationRatioNIH","labelUserView":"Relative Citation Ratio (NIH)","labelSettingsView":"Relative Citation Ratio (NIH)","helpTextSettingsView":"","isVisible":true,"displayRank":"8"},{"labelUserKey":"trendingPubsScore","labelUserView":"TrendingPubs score","labelSettingsView":"TrendingPubs score","helpTextSettingsView":"","isVisible":false,"displayRank":"9"}]', 'Reporting Web View Sort'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('userRoles', '[{"labelUserKey":"Roles````","labelSettingsView":"Select the role","inputType":"check","isRoleGroup":false,"roles":[{"roleId":3,"roleName":"Repoter_All","roleLabel":"Automatically grant all successfully authenticated users the reporter_all role.","isChecked":true},{"roleId":2,"roleName":"Curator_All","roleLabel":"Automatically grant all successfully authenticated users the curator_all role","isChecked":false}]}]', 'User Default Role(s)'); -INSERT INTO `admin_settings` -(`viewName`, `viewAttributes`, `viewLabel`) -VALUES('viewProfile', '[{"labelUserKey":"h-index","labelUserView":"h-index","labelSettingsView":"h-index (NIH)","helpTextUserView":"h-index is the number of an author''s articles in PubMed that have been cited, as defined by NIH''s iCite service, at least that many times. ","helpTextSettingsView":""},{"labelUserKey":"h5-index","labelUserView":"h5-index","labelSettingsView":"h5-index (NIH)","helpTextUserView":"h5-index is the number of an author''s articles in PubMed that have been cited, as defined by NIH''s iCite service, at least that many times within the past 5 years. 
","helpTextSettingsView":""},{"labelUserKey":"hindexScopus","labelUserView":"h-index (Scopus)","labelSettingsView":"h-index (Scopus)","helpTextUserView":"h-index is the number of an author''s articles in Scopus that have been cited, as defined by Scopus, at least that many times.","helpTextSettingsView":""},{"labelUserKey":"h5IndexScopus","labelUserView":"h5-index (Scopus)","labelSettingsView":"h5-index (Scopus)","helpTextUserView":"h-index is the number of an author''s articles in Scopus published in the last five years that have been cited, as defined by Scopus, at least that many times.","helpTextSettingsView":""}]', 'View Profile'); -UNLOCK TABLES; diff --git a/setup/table_admin_roles.sql b/setup/table_admin_roles.sql new file mode 100644 index 0000000..50dcf8b --- /dev/null +++ b/setup/table_admin_roles.sql @@ -0,0 +1,9 @@ +LOCK TABLES `admin_roles` WRITE; +INSERT INTO `admin_roles` (`roleID`, `roleLabel`) VALUES + (1,'Superuser'), + (2,'Curator_All'), + (3,'Reporter_All'), + (4,'Curator_Self'), + (5,'Curator_Department'), + (6,'Curator_Department_Delegate'); +UNLOCK TABLES; \ No newline at end of file diff --git a/update/temp/.gitkeep b/update/temp/.gitkeep deleted file mode 100644 index e69de29..0000000 From 413043cd0c2c918bc85bfaba21d4ae66691188bd Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Thu, 26 Mar 2026 16:27:33 -0400 Subject: [PATCH 03/19] fix(percentile): replace broken percentile/rank/denominator logic in standalone SP STEP 4: Replace person.title with person_person_type LEFT JOIN chain (COALESCE) for facultyRank derivation. Replace article counts that counted all articles with filtered counts (Research Article + percentileNIH IS NOT NULL only). 
STEP 5: Replace all 8 combined UPDATE statements (using wrong threshold-percentage formula) with 24 separate UPDATE statements matching the canonical createEventsProceduresReciterDb.sql v2: - Percentile: AVG of top N articles ranked by percentileNIH DESC - Denominator: Count of peers at same facultyRank meeting thresholds - Rank: RANK() OVER (PARTITION BY facultyRank ORDER BY percentile DESC) All 8 metrics (top5/10 x All/First/Senior/FirstSenior) now match the canonical logic. Steps 1-3, 6, 7 verified identical and unchanged. Ref: createEventsProceduresReciterDb.sql lines 3720-4252 --- setup/populateAnalysisSummaryTables_v2.sql | 600 ++++++++++++++++----- 1 file changed, 460 insertions(+), 140 deletions(-) diff --git a/setup/populateAnalysisSummaryTables_v2.sql b/setup/populateAnalysisSummaryTables_v2.sql index de2bfbd..b8807e5 100644 --- a/setup/populateAnalysisSummaryTables_v2.sql +++ b/setup/populateAnalysisSummaryTables_v2.sql @@ -564,53 +564,106 @@ proc_main: BEGIN CALL log_progress(v_job_id, v_step, 'Complete', 'DONE', NULL, CONCAT(TIMESTAMPDIFF(SECOND, v_start_time, NOW()), 's elapsed')); + -- ======================================================================== -- STEP 4: Populate analysis_summary_person_new -- ======================================================================== SET v_step = '4. 
Populate analysis_summary_person'; CALL log_progress(v_job_id, v_step, 'Inserting person records', 'RUNNING', NULL, NULL); - INSERT INTO analysis_summary_person_new (personIdentifier, nameFirst, nameMiddle, nameLast, facultyRank, department) - SELECT DISTINCT - p.personIdentifier, - p.firstName, - p.middleName, - p.lastName, - p.title, - p.primaryOrganizationalUnit - FROM person p - JOIN analysis_summary_person_scope s ON s.personIdentifier = p.personIdentifier; + -- Populate using person_person_type to derive facultyRank + INSERT INTO analysis_summary_person_new (personIdentifier, nameFirst, nameMiddle, nameLast, department, facultyRank) + SELECT * FROM ( + SELECT DISTINCT + p.personIdentifier, + p.firstName AS nameFirst, + p.middleName AS nameMiddle, + p.lastName AS nameLast, + p.primaryOrganizationalUnit AS department, + COALESCE(a.facultyRank, b.facultyRank, c.facultyRank, d.facultyRank) AS facultyRank + FROM person p + + LEFT JOIN ( + SELECT personIdentifier, 'Full Professor' AS facultyRank + FROM person_person_type + WHERE personType = 'academic-faculty-fullprofessor' + ) a ON a.personIdentifier = p.personIdentifier + + LEFT JOIN ( + SELECT personIdentifier, 'Associate Professor' AS facultyRank + FROM person_person_type + WHERE personType = 'academic-faculty-associate' + ) b ON b.personIdentifier = p.personIdentifier + + LEFT JOIN ( + SELECT personIdentifier, 'Assistant Professor' AS facultyRank + FROM person_person_type + WHERE personType = 'academic-faculty-assistant' + ) c ON c.personIdentifier = p.personIdentifier + + LEFT JOIN ( + SELECT personIdentifier, 'Instructor or Lecturer' AS facultyRank + FROM person_person_type + WHERE personType IN ('academic-faculty-instructor', 'academic-faculty-lecturer') + ) d ON d.personIdentifier = p.personIdentifier + + INNER JOIN analysis_summary_person_scope e ON e.personIdentifier = p.personIdentifier + ) x + WHERE facultyRank IS NOT NULL; SET v_rows = ROW_COUNT(); CALL log_progress(v_job_id, v_step, 'Inserted person 
records', 'INFO', v_rows, NULL); - -- Update article counts + -- ======================================================================== + -- STEP 4b: Compute article counts + -- Counts are for articles with publicationTypeNIH = 'Research Article' + -- and percentileNIH is not null + -- ======================================================================== CALL log_progress(v_job_id, v_step, 'Updating article counts', 'RUNNING', NULL, NULL); + + -- countAll: Count of research articles with NIH percentile UPDATE analysis_summary_person_new p JOIN ( - SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt - FROM analysis_summary_author_new - GROUP BY personIdentifier - ) c ON c.personIdentifier = p.personIdentifier - SET p.countAll = c.cnt; + SELECT s.personIdentifier, COUNT(a1.pmid) AS count + FROM analysis_summary_person_new s + JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL + GROUP BY s.personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.countAll = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated countAll', 'INFO', v_rows, NULL); + -- countFirst: Count of first-authored research articles with NIH percentile UPDATE analysis_summary_person_new p JOIN ( - SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt - FROM analysis_summary_author_new - WHERE authorPosition = 'first' - GROUP BY personIdentifier - ) c ON c.personIdentifier = p.personIdentifier - SET p.countFirst = c.cnt; + SELECT s.personIdentifier, COUNT(a1.pmid) AS count + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL + AND a.authorPosition = 'first' + GROUP BY s.personIdentifier + ) x 
ON x.personIdentifier = p.personIdentifier + SET p.countFirst = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated countFirst', 'INFO', v_rows, NULL); + -- countSenior: Count of senior/last-authored research articles with NIH percentile UPDATE analysis_summary_person_new p JOIN ( - SELECT personIdentifier, COUNT(DISTINCT pmid) AS cnt - FROM analysis_summary_author_new - WHERE authorPosition = 'last' - GROUP BY personIdentifier - ) c ON c.personIdentifier = p.personIdentifier - SET p.countSenior = c.cnt; + SELECT s.personIdentifier, COUNT(a1.pmid) AS count + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE publicationTypeNIH = 'Research Article' AND percentileNIH IS NOT NULL + AND a.authorPosition = 'last' + GROUP BY s.personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.countSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated countSenior', 'INFO', v_rows, NULL); IF v_error_occurred THEN CALL log_progress(v_job_id, v_step, 'Failed', 'ERROR', NULL, v_error_message); @@ -621,152 +674,419 @@ proc_main: BEGIN CALL log_progress(v_job_id, v_step, 'Complete', 'DONE', NULL, CONCAT(TIMESTAMPDIFF(SECOND, v_start_time, NOW()), 's elapsed')); -- ======================================================================== - -- STEP 5: Compute percentile rankings (with rank and denominator) + -- STEP 5: Compute percentile rankings (peer-based) + -- Percentile = average of top N articles by percentileNIH + -- Denominator = count of people with same facultyRank who have the metric + -- Rank = rank within facultyRank by percentile value -- ======================================================================== SET v_step = '5. 
Compute percentile rankings'; - CALL log_progress(v_job_id, v_step, 'Computing percentiles (8 metrics with rank/denominator)', 'RUNNING', NULL, NULL); + CALL log_progress(v_job_id, v_step, 'Computing percentiles (peer-based avg of top N)', 'RUNNING', NULL, NULL); + + -- ======================================================================== + -- 5a. TOP 5 PERCENTILE - ALL POSITIONS + -- ======================================================================== - -- Top 5 percentile, first/last authored + -- top5PercentileAll: Average of top 5 percentiles (requires countAll > 4) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition IN ('first', 'last') - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countAll > 4 + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileFirstSenior = x.pct, - p.top5RankFirstSenior = x.rank_count, - p.top5DenominatorFirstSenior = x.denominator; + SET p.top5PercentileAll = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileAll', 'INFO', v_rows, NULL); - -- Top 10 percentile, first/last authored + -- top5DenominatorAll: Count of people in 
same facultyRank with valid top5PercentileAll UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition IN ('first', 'last') - GROUP BY a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileAll IS NOT NULL AND countAll > 4 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorAll = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorAll', 'INFO', v_rows, NULL); + + -- top5RankAll: Rank within facultyRank by top5PercentileAll + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileAll DESC) AS personRank + FROM analysis_summary_person_new + WHERE countAll > 4 ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileFirstSenior = x.pct, - p.top10RankFirstSenior = x.rank_count, - p.top10DenominatorFirstSenior = x.denominator; + SET p.top5RankAll = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankAll', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5b. 
TOP 10 PERCENTILE - ALL POSITIONS + -- ======================================================================== - -- Top 5 percentile, first authored only + -- top10PercentileAll: Average of top 10 percentiles (requires countAll > 9) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'first' - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + JOIN analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countAll > 9 + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileFirst = x.pct, - p.top5RankFirst = x.rank_count, - p.top5DenominatorFirst = x.denominator; + SET p.top10PercentileAll = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileAll', 'INFO', v_rows, NULL); - -- Top 10 percentile, first authored only + -- top10DenominatorAll: Count of people in same facultyRank with valid top10PercentileAll UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN 
analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'first' - GROUP BY a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileAll IS NOT NULL AND countAll > 9 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorAll = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorAll', 'INFO', v_rows, NULL); + + -- top10RankAll: Rank within facultyRank by top10PercentileAll + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileAll DESC) AS personRank + FROM analysis_summary_person_new + WHERE countAll > 9 ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileFirst = x.pct, - p.top10RankFirst = x.rank_count, - p.top10DenominatorFirst = x.denominator; + SET p.top10RankAll = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankAll', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5c. 
TOP 5 PERCENTILE - FIRST AUTHOR ONLY + -- ======================================================================== - -- Top 5 percentile, last authored only + -- top5PercentileFirst: Average of top 5 percentiles for first-authored (requires countFirst > 4) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'last' - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countFirst > 4 + AND a.authorPosition = 'first' + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileSenior = x.pct, - p.top5RankSenior = x.rank_count, - p.top5DenominatorSenior = x.denominator; + SET p.top5PercentileFirst = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileFirst', 'INFO', v_rows, NULL); - -- Top 10 percentile, last authored only + -- top5DenominatorFirst UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new 
a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - AND authorPosition = 'last' - GROUP BY a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileFirst IS NOT NULL AND countFirst > 4 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorFirst = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorFirst', 'INFO', v_rows, NULL); + + -- top5RankFirst + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileFirst DESC) AS personRank + FROM analysis_summary_person_new + WHERE countFirst > 4 ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileSenior = x.pct, - p.top10RankSenior = x.rank_count, - p.top10DenominatorSenior = x.denominator; + SET p.top5RankFirst = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankFirst', 'INFO', v_rows, NULL); - -- Top 5 percentile, all positions + -- ======================================================================== + -- 5d. 
TOP 10 PERCENTILE - FIRST AUTHOR ONLY + -- ======================================================================== + + -- top10PercentileFirst: Average of top 10 percentiles for first-authored (requires countFirst > 9) UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 95 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - GROUP BY a.personIdentifier + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countFirst > 9 + AND a.authorPosition = 'first' + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier ) x ON x.personIdentifier = p.personIdentifier - SET p.top5PercentileAll = x.pct, - p.top5RankAll = x.rank_count, - p.top5DenominatorAll = x.denominator; + SET p.top10PercentileFirst = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileFirst', 'INFO', v_rows, NULL); - -- Top 10 percentile, all positions + -- top10DenominatorFirst UPDATE analysis_summary_person_new p JOIN ( - SELECT - a.personIdentifier, - ROUND(100 * SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) / COUNT(*), 1) AS pct, - SUM(CASE WHEN percentileNIH >= 90 THEN 1 ELSE 0 END) AS rank_count, - COUNT(*) AS denominator - FROM analysis_summary_author_new a - JOIN analysis_summary_article_new a1 ON a1.pmid = a.pmid - WHERE percentileNIH IS NOT NULL - GROUP BY 
a.personIdentifier + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileFirst IS NOT NULL AND countFirst > 9 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorFirst = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorFirst', 'INFO', v_rows, NULL); + + -- top10RankFirst + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileFirst DESC) AS personRank + FROM analysis_summary_person_new + WHERE countFirst > 9 + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10RankFirst = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankFirst', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5e. TOP 5 PERCENTILE - SENIOR/LAST AUTHOR ONLY + -- ======================================================================== + + -- top5PercentileSenior: Average of top 5 percentiles for last-authored (requires countSenior > 4) + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countSenior > 4 + AND a.authorPosition = 'last' + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5PercentileSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileSenior', 'INFO', v_rows, NULL); + + -- top5DenominatorSenior + 
UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileSenior IS NOT NULL AND countSenior > 4 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorSenior', 'INFO', v_rows, NULL); + + -- top5RankSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE countSenior > 4 + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5RankSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankSenior', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5f. TOP 10 PERCENTILE - SENIOR/LAST AUTHOR ONLY + -- ======================================================================== + + -- top10PercentileSenior: Average of top 10 percentiles for last-authored (requires countSenior > 9) + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL AND s.countSenior > 9 + AND a.authorPosition = 'last' + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10PercentileSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileSenior', 'INFO', v_rows, NULL); 
+ + -- top10DenominatorSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileSenior IS NOT NULL AND countSenior > 9 + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorSenior', 'INFO', v_rows, NULL); + + -- top10RankSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE countSenior > 9 + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10RankSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankSenior', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5g. TOP 5 PERCENTILE - FIRST OR SENIOR (combined) + -- Note: countFirstSenior is computed inline since column doesn't exist + -- ======================================================================== + + -- top5PercentileFirstSenior: Average of top 5 percentiles for first/last authored + -- Requires at least 5 first+last authored articles with percentileNIH + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL + AND a.authorPosition IN ('first', 'last') + AND s.personIdentifier IN ( + -- Only include people with > 4 first/last articles + SELECT s2.personIdentifier + FROM 
analysis_summary_person_new s2 + Join analysis_summary_author_new a2 ON a2.personIdentifier = s2.personIdentifier + Join analysis_summary_article_new a12 ON a12.pmid = a2.pmid + WHERE a12.publicationTypeNIH = 'Research Article' AND a12.percentileNIH IS NOT NULL + AND a2.authorPosition IN ('first', 'last') + GROUP BY s2.personIdentifier + HAVING COUNT(*) > 4 + ) + ) y + WHERE article_rank < 6 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5PercentileFirstSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5PercentileFirstSenior', 'INFO', v_rows, NULL); + + -- top5DenominatorFirstSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top5PercentileFirstSenior IS NOT NULL + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top5DenominatorFirstSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5DenominatorFirstSenior', 'INFO', v_rows, NULL); + + -- top5RankFirstSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top5PercentileFirstSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE top5PercentileFirstSenior IS NOT NULL + ) x ON x.personIdentifier = p.personIdentifier + SET p.top5RankFirstSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top5RankFirstSenior', 'INFO', v_rows, NULL); + + -- ======================================================================== + -- 5h. 
TOP 10 PERCENTILE - FIRST OR SENIOR (combined) + -- ======================================================================== + + -- top10PercentileFirstSenior (requires > 9 first/last articles) + UPDATE analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, ROUND(AVG(percentileNIH), 3) AS percentileNIH + FROM ( + SELECT s.personIdentifier, a1.pmid, a1.percentileNIH, + RANK() OVER (PARTITION BY s.personIdentifier ORDER BY a1.percentileNIH DESC) AS article_rank + FROM analysis_summary_person_new s + Join analysis_summary_author_new a ON a.personIdentifier = s.personIdentifier + Join analysis_summary_article_new a1 ON a1.pmid = a.pmid + WHERE a1.percentileNIH IS NOT NULL + AND a.authorPosition IN ('first', 'last') + AND s.personIdentifier IN ( + -- Only include people with > 9 first/last articles + SELECT s2.personIdentifier + FROM analysis_summary_person_new s2 + Join analysis_summary_author_new a2 ON a2.personIdentifier = s2.personIdentifier + Join analysis_summary_article_new a12 ON a12.pmid = a2.pmid + WHERE a12.publicationTypeNIH = 'Research Article' AND a12.percentileNIH IS NOT NULL + AND a2.authorPosition IN ('first', 'last') + GROUP BY s2.personIdentifier + HAVING COUNT(*) > 9 + ) + ) y + WHERE article_rank < 11 + GROUP BY personIdentifier + ) x ON x.personIdentifier = p.personIdentifier + SET p.top10PercentileFirstSenior = x.percentileNIH; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10PercentileFirstSenior', 'INFO', v_rows, NULL); + + -- top10DenominatorFirstSenior + UPDATE analysis_summary_person_new p + JOIN ( + SELECT COUNT(*) AS count, facultyRank + FROM analysis_summary_person_new + WHERE top10PercentileFirstSenior IS NOT NULL + GROUP BY facultyRank + ) x ON x.facultyRank = p.facultyRank + SET p.top10DenominatorFirstSenior = x.count; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10DenominatorFirstSenior', 'INFO', v_rows, NULL); + + -- top10RankFirstSenior + UPDATE 
analysis_summary_person_new p + JOIN ( + SELECT personIdentifier, + RANK() OVER (PARTITION BY facultyRank ORDER BY top10PercentileFirstSenior DESC) AS personRank + FROM analysis_summary_person_new + WHERE top10PercentileFirstSenior IS NOT NULL ) x ON x.personIdentifier = p.personIdentifier - SET p.top10PercentileAll = x.pct, - p.top10RankAll = x.rank_count, - p.top10DenominatorAll = x.denominator; + SET p.top10RankFirstSenior = x.personRank; + SET v_rows = ROW_COUNT(); + CALL log_progress(v_job_id, v_step, 'Updated top10RankFirstSenior', 'INFO', v_rows, NULL); IF v_error_occurred THEN CALL log_progress(v_job_id, v_step, 'Failed', 'ERROR', NULL, v_error_message); From 6ad492ee2e73bba35dbe452e7beafcf369baec14 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Tue, 14 Apr 2026 12:39:33 -0400 Subject: [PATCH 04/19] feat(schema): add 4 new Feature Generator fields Capture new top-level and feedbackEvidence fields emitted by ReCiter: - datePublicationAddedToPMC (person_article, analysis_summary_article) - feedbackScoreTextSimilarity (person_article) - feedbackScoreJournalTitleSimilarity (person_article) - feedbackScoreBibliographicCoupling (person_article) Schema, nightly SP (v2 + inline copy), legacy loose-SQL insert, CSV transformer, and LOAD DATA column list updated in lockstep. Adds idempotent migration setup/alter_add_feature_generator_fields_v1.1.sql for applying to prod and dev DBs out-of-band. ALTER must run BEFORE deploying the updated ETL image, otherwise LOAD DATA fails on unknown columns. 
--- ...lter_add_feature_generator_fields_v1.1.sql | 80 +++++++++++++++++++ setup/createDatabaseTableReciterDb.sql | 5 ++ setup/createEventsProceduresReciterDb.sql | 12 +-- setup/populateAnalysisSummaryTables_v2.sql | 4 +- update/dataTransformer.py | 19 +++-- update/updateReciterDB.py | 4 +- 6 files changed, 112 insertions(+), 12 deletions(-) create mode 100644 setup/alter_add_feature_generator_fields_v1.1.sql diff --git a/setup/alter_add_feature_generator_fields_v1.1.sql b/setup/alter_add_feature_generator_fields_v1.1.sql new file mode 100644 index 0000000..ab5733a --- /dev/null +++ b/setup/alter_add_feature_generator_fields_v1.1.sql @@ -0,0 +1,80 @@ +-- ============================================================================= +-- Migration: Add 4 new Feature Generator fields (v1.1) +-- ============================================================================= +-- Adds columns introduced by ReCiter Feature Generator: +-- - datePublicationAddedToPMC (top-level article field) +-- - feedbackScoreTextSimilarity (evidence.feedbackEvidence) +-- - feedbackScoreJournalTitleSimilarity (evidence.feedbackEvidence) +-- - feedbackScoreBibliographicCoupling (evidence.feedbackEvidence) +-- +-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via +-- information_schema check (no-op on re-run). +-- +-- Run BEFORE deploying the updated Python ETL, otherwise LOAD DATA INFILE +-- will fail with "Unknown column" on the 4 new headers. 
+-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- person_article: + datePublicationAddedToPMC + 3 feedback scores +-- ----------------------------------------------------------------------------- + +SET @db = DATABASE(); + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'datePublicationAddedToPMC') = 0, + 'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`', + 'SELECT ''person_article.datePublicationAddedToPMC already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'feedbackScoreTextSimilarity') = 0, + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL AFTER `feedbackScoreYear`', + 'SELECT ''person_article.feedbackScoreTextSimilarity already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'feedbackScoreJournalTitleSimilarity') = 0, + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL AFTER `feedbackScoreTextSimilarity`', + 'SELECT ''person_article.feedbackScoreJournalTitleSimilarity already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'person_article' + AND column_name = 'feedbackScoreBibliographicCoupling') = 0, + 'ALTER TABLE person_article ADD COLUMN 
`feedbackScoreBibliographicCoupling` float DEFAULT NULL AFTER `feedbackScoreJournalTitleSimilarity`', + 'SELECT ''person_article.feedbackScoreBibliographicCoupling already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- analysis_summary_article: + datePublicationAddedToPMC +-- (feedback scores NOT carried into summary — per-person-article only) +-- ----------------------------------------------------------------------------- + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'analysis_summary_article' + AND column_name = 'datePublicationAddedToPMC') = 0, + 'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`', + 'SELECT ''analysis_summary_article.datePublicationAddedToPMC already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = DATABASE() + AND column_name IN ( + 'datePublicationAddedToPMC', + 'feedbackScoreTextSimilarity', + 'feedbackScoreJournalTitleSimilarity', + 'feedbackScoreBibliographicCoupling') +ORDER BY table_name, ordinal_position; diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql index 13d04d8..cdedbd7 100644 --- a/setup/createDatabaseTableReciterDb.sql +++ b/setup/createDatabaseTableReciterDb.sql @@ -311,6 +311,7 @@ CREATE TABLE IF NOT EXISTS `analysis_summary_article` ( `publicationDateDisplay` varchar(200) DEFAULT NULL, `publicationDateStandardized` varchar(128) DEFAULT NULL, `datePublicationAddedToEntrez` varchar(128) 
DEFAULT NULL, + `datePublicationAddedToPMC` varchar(128) DEFAULT NULL, `articleTitle` varchar(1000) DEFAULT NULL, `articleTitleRTF` varchar(2000) DEFAULT NULL, `publicationTypeCanonical` varchar(128) DEFAULT NULL, @@ -654,6 +655,7 @@ CREATE TABLE IF NOT EXISTS `person_article` ( `scopusNonTargetAuthorInstitutionalAffiliationSource` varchar(128) DEFAULT NULL, `scopusNonTargetAuthorInstitutionalAffiliationScore` float DEFAULT 0, `datePublicationAddedToEntrez` varchar(128) DEFAULT NULL, + `datePublicationAddedToPMC` varchar(128) DEFAULT NULL, `doi` varchar(128) DEFAULT NULL, `issn` varchar(128) DEFAULT NULL, `issue` varchar(500) DEFAULT 'NULL', @@ -673,6 +675,9 @@ CREATE TABLE IF NOT EXISTS `person_article` ( `feedbackScoreOrganization` float DEFAULT NULL, `feedbackScoreTargetAuthorName` float DEFAULT NULL, `feedbackScoreYear` float DEFAULT NULL, + `feedbackScoreTextSimilarity` float DEFAULT NULL, + `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL, + `feedbackScoreBibliographicCoupling` float DEFAULT NULL, `totalArticleScoreStandardized` int(11) DEFAULT NULL, `totalArticleScoreNonStandardized` float DEFAULT NULL, `targetAuthorCount` int(11) DEFAULT NULL, diff --git a/setup/createEventsProceduresReciterDb.sql b/setup/createEventsProceduresReciterDb.sql index 7891ea1..533878b 100644 --- a/setup/createEventsProceduresReciterDb.sql +++ b/setup/createEventsProceduresReciterDb.sql @@ -2455,10 +2455,10 @@ order by pmid desc, rank asc; #### 3. 
Populate "analysis_summary_article" table with articles #### -insert into analysis_summary_article (pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus) -select distinct -pmid, max(pmcid), publicationTypeCanonical, articleYear, min(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, max(timesCited) -from person_article +insert into analysis_summary_article (pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, datePublicationAddedToEntrez, datePublicationAddedToPMC, articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus) +select distinct +pmid, max(pmcid), publicationTypeCanonical, articleYear, min(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, max(datePublicationAddedToPMC), articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, max(timesCited) +from person_article where userAssertion = 'ACCEPTED' group by pmid order by datePublicationAddedToEntrez desc; @@ -3584,7 +3584,8 @@ proc_main: BEGIN INSERT INTO analysis_summary_article_new ( pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, - datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, + datePublicationAddedToEntrez, datePublicationAddedToPMC, + articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus ) SELECT DISTINCT @@ -3595,6 +3596,7 @@ proc_main: BEGIN MIN(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, + MAX(datePublicationAddedToPMC), articleTitle, journalTitleVerbose, issn, diff --git a/setup/populateAnalysisSummaryTables_v2.sql b/setup/populateAnalysisSummaryTables_v2.sql 
index b8807e5..08b3e63 100644 --- a/setup/populateAnalysisSummaryTables_v2.sql +++ b/setup/populateAnalysisSummaryTables_v2.sql @@ -420,7 +420,8 @@ proc_main: BEGIN INSERT INTO analysis_summary_article_new ( pmid, pmcid, publicationTypeCanonical, articleYear, publicationDateStandardized, publicationDateDisplay, - datePublicationAddedToEntrez, articleTitle, journalTitleVerbose, + datePublicationAddedToEntrez, datePublicationAddedToPMC, + articleTitle, journalTitleVerbose, issn, doi, issue, volume, pages, citationCountScopus ) SELECT DISTINCT @@ -431,6 +432,7 @@ proc_main: BEGIN MIN(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, + MAX(datePublicationAddedToPMC), articleTitle, journalTitleVerbose, issn, diff --git a/update/dataTransformer.py b/update/dataTransformer.py index e9328b8..f89d502 100644 --- a/update/dataTransformer.py +++ b/update/dataTransformer.py @@ -227,13 +227,15 @@ def process_person_article(items, output_path): "scopusNonTargetAuthorInstitutionalAffiliationSource", "scopusNonTargetAuthorInstitutionalAffiliationScore", - "datePublicationAddedToEntrez", "doi", + "datePublicationAddedToEntrez", "datePublicationAddedToPMC", "doi", "issn", "issue", "journalTitleISOabbreviation", "pages", "timesCited", "volume", - + "feedbackScoreCites", "feedbackScoreCoAuthorName", "feedbackScoreEmail", "feedbackScoreInstitution", "feedbackScoreJournal", "feedbackScoreJournalSubField", "feedbackScoreKeyword", "feedbackScoreOrcid", "feedbackScoreOrcidCoAuthor", "feedbackScoreOrganization", "feedbackScoreTargetAuthorName", "feedbackScoreYear", + "feedbackScoreTextSimilarity", "feedbackScoreJournalTitleSimilarity", + "feedbackScoreBibliographicCoupling", "totalArticleScoreStandardized", "totalArticleScoreNonStandardized" ] @@ -398,6 +400,7 @@ def process_person_article(items, output_path): # Additional fields date_publication_added_to_entrez = sanitize_field(article.get('datePublicationAddedToEntrez', '')) + date_publication_added_to_pmc 
= sanitize_field(article.get('datePublicationAddedToPMC', '')) doi = sanitize_field(article.get('doi', '')) issn_list = article.get('issn', []) issn = '' @@ -432,11 +435,13 @@ def process_person_article(items, output_path): 'feedbackScoreCites', 'feedbackScoreCoAuthorName', 'feedbackScoreEmail', 'feedbackScoreInstitution', 'feedbackScoreJournal', 'feedbackScoreJournalSubField', 'feedbackScoreKeyword', 'feedbackScoreOrcid', 'feedbackScoreOrcidCoAuthor', - 'feedbackScoreOrganization', 'feedbackScoreTargetAuthorName', 'feedbackScoreYear' + 'feedbackScoreOrganization', 'feedbackScoreTargetAuthorName', 'feedbackScoreYear', + 'feedbackScoreTextSimilarity', 'feedbackScoreJournalTitleSimilarity', + 'feedbackScoreBibliographicCoupling' ] ] else: - feedback_scores = [''] * 12 # Assuming 12 feedback scores + feedback_scores = [''] * 15 # Assuming 15 feedback scores total_article_score_standardized = sanitize_field(article.get('totalArticleScoreStandardized', '')) total_article_score_non_standardized = sanitize_field(article.get('totalArticleScoreNonStandardized', '')) @@ -508,6 +513,7 @@ def process_person_article(items, output_path): "scopusNonTargetAuthorInstitutionalAffiliationScore": scopus_non_target_author_institutional_affiliation_score, "datePublicationAddedToEntrez": date_publication_added_to_entrez, + "datePublicationAddedToPMC": date_publication_added_to_pmc, "doi": doi, "issn": issn, "issue": issue, @@ -529,7 +535,10 @@ def process_person_article(items, output_path): "feedbackScoreOrganization": feedback_scores[9], "feedbackScoreTargetAuthorName": feedback_scores[10], "feedbackScoreYear": feedback_scores[11], - + "feedbackScoreTextSimilarity": feedback_scores[12], + "feedbackScoreJournalTitleSimilarity": feedback_scores[13], + "feedbackScoreBibliographicCoupling": feedback_scores[14], + "totalArticleScoreStandardized": total_article_score_standardized, "totalArticleScoreNonStandardized": total_article_score_non_standardized } diff --git a/update/updateReciterDB.py 
b/update/updateReciterDB.py index 6aa3a7f..94d0e55 100644 --- a/update/updateReciterDB.py +++ b/update/updateReciterDB.py @@ -303,12 +303,14 @@ def main(truncate_tables=True, skip_person_temp=False): "pubmedTargetAuthorInstitutionalAffiliationMatchTypeScore", "scopusNonTargetAuthorInstitutionalAffiliationSource", "scopusNonTargetAuthorInstitutionalAffiliationScore", - "datePublicationAddedToEntrez", "doi", + "datePublicationAddedToEntrez", "datePublicationAddedToPMC", "doi", "issn", "issue", "journalTitleISOabbreviation", "pages", "timesCited", "volume", "feedbackScoreCites", "feedbackScoreCoAuthorName", "feedbackScoreEmail", "feedbackScoreInstitution", "feedbackScoreJournal", "feedbackScoreJournalSubField", "feedbackScoreKeyword", "feedbackScoreOrcid", "feedbackScoreOrcidCoAuthor", "feedbackScoreOrganization", "feedbackScoreTargetAuthorName", "feedbackScoreYear", + "feedbackScoreTextSimilarity", "feedbackScoreJournalTitleSimilarity", + "feedbackScoreBibliographicCoupling", "totalArticleScoreStandardized", "totalArticleScoreNonStandardized" ], 'person_article_author': [ From 9bd507cfb5b5253c601715d7dac22678ecc76069 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Tue, 14 Apr 2026 12:49:04 -0400 Subject: [PATCH 05/19] fix(migration): drop AFTER clauses to enable ALGORITHM=INSTANT Applied to prod + dev on MariaDB 11.4 / 10.6 in ~25s (network-bound). AFTER placement forces INPLACE algorithm with metadata lock; appending at end allows INSTANT (no table rewrite, no lock hold). 
--- setup/alter_add_feature_generator_fields_v1.1.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup/alter_add_feature_generator_fields_v1.1.sql b/setup/alter_add_feature_generator_fields_v1.1.sql index ab5733a..fb4c8b7 100644 --- a/setup/alter_add_feature_generator_fields_v1.1.sql +++ b/setup/alter_add_feature_generator_fields_v1.1.sql @@ -24,7 +24,7 @@ SET @sql = (SELECT IF( (SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'datePublicationAddedToPMC') = 0, - 'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`', + 'ALTER TABLE person_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL', 'SELECT ''person_article.datePublicationAddedToPMC already exists''')); PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; @@ -32,7 +32,7 @@ SET @sql = (SELECT IF( (SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'feedbackScoreTextSimilarity') = 0, - 'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL AFTER `feedbackScoreYear`', + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreTextSimilarity` float DEFAULT NULL', 'SELECT ''person_article.feedbackScoreTextSimilarity already exists''')); PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; @@ -40,7 +40,7 @@ SET @sql = (SELECT IF( (SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'feedbackScoreJournalTitleSimilarity') = 0, - 'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL AFTER `feedbackScoreTextSimilarity`', + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreJournalTitleSimilarity` float DEFAULT NULL', 'SELECT ''person_article.feedbackScoreJournalTitleSimilarity 
already exists''')); PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; @@ -48,7 +48,7 @@ SET @sql = (SELECT IF( (SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = @db AND table_name = 'person_article' AND column_name = 'feedbackScoreBibliographicCoupling') = 0, - 'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL AFTER `feedbackScoreJournalTitleSimilarity`', + 'ALTER TABLE person_article ADD COLUMN `feedbackScoreBibliographicCoupling` float DEFAULT NULL', 'SELECT ''person_article.feedbackScoreBibliographicCoupling already exists''')); PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; @@ -61,7 +61,7 @@ SET @sql = (SELECT IF( (SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = @db AND table_name = 'analysis_summary_article' AND column_name = 'datePublicationAddedToPMC') = 0, - 'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL AFTER `datePublicationAddedToEntrez`', + 'ALTER TABLE analysis_summary_article ADD COLUMN `datePublicationAddedToPMC` varchar(128) DEFAULT NULL', 'SELECT ''analysis_summary_article.datePublicationAddedToPMC already exists''')); PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; From 1ad2244bcd0af0eb713a9294d1dce59ed2d43f52 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Wed, 6 May 2026 13:06:30 -0400 Subject: [PATCH 06/19] fix(SP): fall back to publicationDateStandardized when articleYear is 0 The v2 populateAnalysisSummaryTables SP was selecting articleYear directly from person_article without a fallback, leaving 74% of analysis_summary_article rows with articleYear = 0. The legacy SP had a post-INSERT UPDATE step that derived the year from publicationDateStandardized in those cases, which the v2 rewrite dropped. 
Push the fallback into the SELECT itself: IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4)) Verified post-deployment: 0 rows with articleYear = 0 in prod and dev. --- setup/createEventsProceduresReciterDb.sql | 2 +- setup/populateAnalysisSummaryTables_v2.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup/createEventsProceduresReciterDb.sql b/setup/createEventsProceduresReciterDb.sql index 533878b..c8a05f5 100644 --- a/setup/createEventsProceduresReciterDb.sql +++ b/setup/createEventsProceduresReciterDb.sql @@ -3592,7 +3592,7 @@ proc_main: BEGIN pmid, MAX(pmcid), publicationTypeCanonical, - articleYear, + IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4)), MIN(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, diff --git a/setup/populateAnalysisSummaryTables_v2.sql b/setup/populateAnalysisSummaryTables_v2.sql index 08b3e63..e2a6e3f 100644 --- a/setup/populateAnalysisSummaryTables_v2.sql +++ b/setup/populateAnalysisSummaryTables_v2.sql @@ -428,7 +428,7 @@ proc_main: BEGIN pmid, MAX(pmcid), publicationTypeCanonical, - articleYear, + IF(articleYear != 0, articleYear, LEFT(publicationDateStandardized, 4)), MIN(publicationDateStandardized), publicationDateDisplay, datePublicationAddedToEntrez, From 9c215fd7a304920a639ce6caa2912b3be76d60d6 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Fri, 15 May 2026 18:43:12 -0400 Subject: [PATCH 07/19] fix(retrieveNIH): correct column inversion in analysis_nih_cites load The iCite API returns two arrays per record: - cited_by: PMIDs that cite the queried article - references: PMIDs the queried article cites The CSV write for cited_by entries used [cited_by_elem, queried_pmid], but the LOAD DATA columns are (cited_pmid, citing_pmid). Since cited_by elements are the CITING articles (not the cited), this stored every cited_by edge with the cited/citing columns swapped. The references loop was already correct. 
Same inversion affected analysis_nih_cites_clin. Effect: queries like SELECT COUNT(*) FROM analysis_nih_cites WHERE cited_pmid = <pmid> returned only the rows sourced from other queried PMIDs' references arrays, missing the (typically much larger) cited_by set. Example: PMID 32432483 has iCite citation_count=192 but only 19 rows surfaced because just 19 WCM-tracked papers happened to reference it. After this change, on the next nightly run the table swap will load analysis_nih_cites with semantically correct (cited_pmid, citing_pmid) columns, matching the join used by downstream consumers (Scholars-Profile-System/lib/api/publication-detail.ts). --- update/retrieveNIH.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/update/retrieveNIH.py b/update/retrieveNIH.py index 97b40bd..9b39d72 100644 --- a/update/retrieveNIH.py +++ b/update/retrieveNIH.py @@ -138,22 +138,23 @@ def write_records_to_csv(records, csv_files): nih_writer.writerow(nih_record) nih_count += 1 - citing_pmid = get_dict_value(record, "pmid") + queried_pmid = get_dict_value(record, "pmid") - # Write to analysis_nih_cites + # iCite "cited_by" = articles that cite queried_pmid; "references" = articles queried_pmid cites. + # CSV column order matches LOAD DATA columns: (cited_pmid, citing_pmid). 
if record.get("cited_by"): - for cited_by in record["cited_by"]: - cites_writer.writerow([cited_by, citing_pmid]) + for citing in record["cited_by"]: + cites_writer.writerow([queried_pmid, citing]) cites_count += 1 if record.get("references"): - for ref in record["references"]: - cites_writer.writerow([ref, citing_pmid]) + for cited in record["references"]: + cites_writer.writerow([cited, queried_pmid]) cites_count += 1 # Write to analysis_nih_cites_clin if record.get("cited_by_clin"): - for cited_by_clin in record["cited_by_clin"]: - cites_clin_writer.writerow([cited_by_clin, citing_pmid]) + for citing_clin in record["cited_by_clin"]: + cites_clin_writer.writerow([queried_pmid, citing_clin]) cites_clin_count +=1 except Exception as e: From 810bc177e5125b89f2d7e16fdd20a6ebef68491a Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Sat, 16 May 2026 19:43:46 -0400 Subject: [PATCH 08/19] fix(abstractImport): replace CSV bulk-load that silently dropped rows abstractImport.py wrote abstracts to a CSV with csv.writer, then bulk-loaded it with LOAD DATA LOCAL INFILE. The two are incompatible: csv.writer emits \r\n line endings and doubled-quote escaping (""), while the LOAD used LINES TERMINATED BY '\n' and ENCLOSED BY '"' with backslash escaping. Any abstract containing a double quote desynced MySQL's row parser, so ~99.9% of every file was dropped with no error raised (~43 of ~72,000 rows loaded). The unresolved PMIDs were re-selected every cycle and the unbounded while-True loop ran until the 15,000s pipeline timeout, restart-looping indefinitely and blocking the nightly CronJob. - Replace CSV + LOAD DATA with a parameterized, batched executemany INSERT so abstract text is stored verbatim regardless of content. - Bound the fetch/insert loop: stop on no progress, cap at 25 cycles, so unresolvable PMIDs can no longer hang the pipeline. - Retry batch_get_item UnprocessedKeys with backoff instead of dropping throttled keys. 
- Connect with charset=utf8mb4; drop the now-unused local_infile flag. - Add a --dry-run mode that verifies the fetch/insert path against a TEMPORARY table without touching reporting_abstracts. --- update/abstractImport.py | 338 +++++++++++++++++++++++++++------------ 1 file changed, 236 insertions(+), 102 deletions(-) diff --git a/update/abstractImport.py b/update/abstractImport.py index 867b538..8057b24 100644 --- a/update/abstractImport.py +++ b/update/abstractImport.py @@ -1,10 +1,10 @@ # abstractImport.py import boto3 -import csv import logging import pymysql.cursors import pymysql.err +import random import sys import time import os @@ -20,6 +20,10 @@ ) logger = logging.getLogger(__name__) +# Quiet botocore's per-call credential/endpoint chatter so pipeline logs stay readable. +logging.getLogger("botocore").setLevel(logging.WARNING) +logging.getLogger("boto3").setLevel(logging.WARNING) + # ------------------------------------------------------------------------------ # Environment Variables # ------------------------------------------------------------------------------ @@ -28,15 +32,31 @@ DB_HOST = os.getenv("DB_HOST") DB_NAME = os.getenv("DB_NAME") -# DynamoDB concurrency settings -CHUNK_SIZE = 100 # Max items per batch_get_item call -MAX_WORKERS = 5 # Number of threads for parallel fetching +# ------------------------------------------------------------------------------ +# Settings +# ------------------------------------------------------------------------------ +# DynamoDB fetch +CHUNK_SIZE = 100 # Max keys per batch_get_item call (DynamoDB hard limit) +MAX_WORKERS = 5 # Threads for parallel fetching +MAX_UNPROCESSED_RETRIES = 8 # Backoff retries for keys DynamoDB reports as unprocessed + +# Insert +INSERT_BATCH_SIZE = 200 # Rows per executemany batch (kept well under max_allowed_packet) + +# Loop safety +MAX_CYCLES = 25 # Hard cap on fetch/insert cycles; a healthy run needs 1-2 + +# Dry run +DRY_RUN = "--dry-run" in sys.argv +DRY_RUN_SAMPLE = 500 # PMIDs 
processed when --dry-run is passed +DRY_RUN_TABLE = "reporting_abstracts_dryrun" + # ------------------------------------------------------------------------------ # Database Connection # ------------------------------------------------------------------------------ def connect_mysql_server(db_user, db_pass, db_host, db_name): - """Function to connect to MySQL database""" + """Connect to the MariaDB database.""" try: mysql_db = pymysql.connect( user=db_user, @@ -44,7 +64,7 @@ def connect_mysql_server(db_user, db_pass, db_host, db_name): database=db_name, host=db_host, autocommit=True, - local_infile=True, + charset="utf8mb4", cursorclass=pymysql.cursors.DictCursor ) logger.info(f"Connected to database server: {db_host}, database: {db_name}, user: {db_user}") @@ -53,13 +73,14 @@ def connect_mysql_server(db_user, db_pass, db_host, db_name): logger.error(f"{time.ctime()} -- Error connecting to the database: {err}") sys.exit(1) + # ------------------------------------------------------------------------------ # Fetch All Missing PMIDs # ------------------------------------------------------------------------------ def fetch_missing_pmids(mysql_conn): """ - Returns a list of all PMIDs that exist in analysis_summary_article - but do NOT exist in reporting_abstracts. + Returns every PMID that exists in analysis_summary_article but has no + matching row in reporting_abstracts. """ sql = """ SELECT DISTINCT p.pmid AS pmid @@ -69,16 +90,17 @@ def fetch_missing_pmids(mysql_conn): """ with mysql_conn.cursor() as cursor: cursor.execute(sql) - rows = cursor.fetchall() - return [row["pmid"] for row in rows] + return [row["pmid"] for row in cursor.fetchall()] + # ------------------------------------------------------------------------------ # Extract Abstract Text # ------------------------------------------------------------------------------ def get_abstract(item): """ - Extracts the abstract text from a DynamoDB item representing a PubMed article. 
- Handles labeled abstract segments if present. + Extracts the abstract text from a DynamoDB item representing a PubMed + article. Handles labeled abstract segments. Returns "" when no abstract + is present. """ medline_citation = item.get("pubmedarticle", {}).get("medlinecitation") if not medline_citation: @@ -102,125 +124,237 @@ def get_abstract(item): return " ".join(abstract_texts) if abstract_texts else "" + # ------------------------------------------------------------------------------ -# Batch Fetch Abstracts from DynamoDB +# Fetch Abstracts from DynamoDB # ------------------------------------------------------------------------------ def fetch_abstracts_for_chunk(chunk_pmids): """ - Performs a single batch_get_item call for the given chunk of PMIDs. - Returns a list of (pmid, abstract_text) pairs. + Fetches one chunk of PMIDs from DynamoDB via batch_get_item. Any keys that + DynamoDB reports as unprocessed (throttling) are retried with exponential + backoff so they are not silently lost. Returns (pmid, abstract) pairs. 
""" - dynamodb = boto3.resource("dynamodb") - client = dynamodb.meta.client + client = boto3.resource("dynamodb").meta.client + + request_keys = [{"pmid": pmid} for pmid in chunk_pmids] + results = [] + attempt = 0 - # Prepare Keys for batch_get_item - keys = [{"pmid": pmid} for pmid in chunk_pmids] + while request_keys: + response = client.batch_get_item( + RequestItems={"PubMedArticle": {"Keys": request_keys}} + ) - # Perform batch_get_item - response = client.batch_get_item( - RequestItems={ - "PubMedArticle": {"Keys": keys} - } - ) + for item in response["Responses"].get("PubMedArticle", []): + pmid = item.get("pmid") + if pmid is not None: + results.append((pmid, get_abstract(item))) - items = response["Responses"].get("PubMedArticle", []) - results = [] - for item in items: - pmid = item.get("pmid") - if pmid: - abstract_text = get_abstract(item) - results.append((pmid, abstract_text)) + request_keys = ( + response.get("UnprocessedKeys", {}) + .get("PubMedArticle", {}) + .get("Keys", []) + ) + if request_keys: + attempt += 1 + if attempt > MAX_UNPROCESSED_RETRIES: + logger.warning( + f"{len(request_keys)} key(s) still unprocessed after " + f"{MAX_UNPROCESSED_RETRIES} retries; skipping this chunk's remainder." + ) + break + time.sleep(min(0.1 * (2 ** attempt), 5.0)) return results + +def fetch_all_abstracts(pmids): + """Fetches abstracts for all given PMIDs from DynamoDB in parallel.""" + chunks = [pmids[i:i + CHUNK_SIZE] for i in range(0, len(pmids), CHUNK_SIZE)] + logger.info(f"Created {len(chunks)} chunk(s). 
Each chunk up to {CHUNK_SIZE} PMIDs.") + + all_results = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: + futures = {executor.submit(fetch_abstracts_for_chunk, c): c for c in chunks} + for future in concurrent.futures.as_completed(futures): + try: + all_results.extend(future.result()) + except Exception as e: + logger.exception(f"Error fetching chunk: {e}") + return all_results + + # ------------------------------------------------------------------------------ -# Bulk-Load a Single CSV into reporting_abstracts +# Insert Abstracts # ------------------------------------------------------------------------------ -def load_csv_into_reporting_abstracts(mysql_conn, csv_path): +def insert_abstracts(mysql_conn, results, target_table="reporting_abstracts"): + """ + Inserts (pmid, abstract) pairs with a parameterized, batched INSERT. + + pymysql binds every value as a query parameter, so abstracts containing + double quotes, tabs, newlines or backslashes are stored verbatim. The + previous CSV + LOAD DATA INFILE path could not parse such content and + silently dropped the affected rows. 
+ """ + if not results: + logger.info("No abstracts to insert.") + return 0 + + insert_sql = f"INSERT INTO {target_table} (pmid, abstract) VALUES (%s, %s)" + inserted = 0 with mysql_conn.cursor() as cursor: - cwd = os.getcwd() - full_csv_path = os.path.join(cwd, csv_path).replace("\\", "/") # Ensure correct path format - - load_query = ( - "LOAD DATA LOCAL INFILE '{path}' " - "INTO TABLE reporting_abstracts " - "FIELDS TERMINATED BY '\t' ENCLOSED BY '\"' " - "LINES TERMINATED BY '\n' " - "IGNORE 1 LINES (pmid, abstract);" - ).format(path=full_csv_path) - - cursor.execute(load_query) - logger.info(f"{time.ctime()} -- {csv_path} loaded into reporting_abstracts.") - - update_query = ( - "UPDATE reporting_abstracts " - "SET abstractVarchar = CAST(abstract AS CHAR(15000)) " - "WHERE abstractVarchar IS NULL;" + for i in range(0, len(results), INSERT_BATCH_SIZE): + batch = results[i:i + INSERT_BATCH_SIZE] + cursor.executemany(insert_sql, batch) + inserted += len(batch) + logger.info(f"{time.ctime()} -- Inserted {inserted} row(s) into {target_table}.") + + cursor.execute( + f"UPDATE {target_table} " + f"SET abstractVarchar = CAST(abstract AS CHAR(15000)) " + f"WHERE abstractVarchar IS NULL" ) - cursor.execute(update_query) - logger.info(f"{time.ctime()} -- reporting_abstracts updated with varchar equivalents.") + logger.info(f"{time.ctime()} -- {target_table} updated with varchar equivalents.") + return inserted + # ------------------------------------------------------------------------------ -# Main Script Logic +# Dry Run +# ------------------------------------------------------------------------------ +def run_dry_run(mysql_conn): + """ + Verifies the fetch -> insert path end to end without modifying + reporting_abstracts: a random sample of missing PMIDs is processed into a + session-private TEMPORARY table, then verified and discarded. 
+ """ + logger.info("=== DRY RUN === reporting_abstracts will NOT be modified.") + + all_pmids = fetch_missing_pmids(mysql_conn) + logger.info(f"{len(all_pmids)} PMID(s) currently missing abstracts in production.") + if not all_pmids: + logger.info("Nothing missing; no sample to process.") + mysql_conn.close() + return + + sample = random.sample(all_pmids, min(DRY_RUN_SAMPLE, len(all_pmids))) + logger.info(f"Processing a random sample of {len(sample)} PMID(s) through the new insert path.") + + try: + with mysql_conn.cursor() as cursor: + cursor.execute(f"CREATE TEMPORARY TABLE {DRY_RUN_TABLE} LIKE reporting_abstracts") + + all_results = fetch_all_abstracts(sample) + logger.info(f"Fetched {len(all_results)} item(s) from DynamoDB (requested {len(sample)}).") + if not all_results: + logger.error("DRY RUN FAILED: DynamoDB returned nothing for the sample.") + return + + poison = [ + (p, a) for p, a in all_results + if a and any(c in a for c in ('"', '\t', '\n', '\r', '\\')) + ] + logger.info( + f"{len(poison)} of {len(all_results)} fetched abstracts contain " + f"quotes/tabs/newlines/backslashes -- the content the old LOAD DATA " + f"path silently dropped." + ) + if poison: + logger.info(f"Example poison abstract (PMID {poison[0][0]}): {poison[0][1][:160]!r}") + + inserted = insert_abstracts(mysql_conn, all_results, target_table=DRY_RUN_TABLE) + + with mysql_conn.cursor() as cursor: + cursor.execute( + f"SELECT COUNT(*) c, COUNT(DISTINCT pmid) d, " + f"SUM(pmid IS NULL) nullp, SUM(abstractVarchar IS NULL) nullv " + f"FROM {DRY_RUN_TABLE}" + ) + stats = cursor.fetchone() + + counts_ok = ( + stats["c"] == len(all_results) + and not stats["nullp"] + and not stats["nullv"] + ) + + # Content-integrity check: re-read the longest poison abstract verbatim. 
+ integrity_ok = True + if poison: + worst_pmid, worst_abs = max(poison, key=lambda x: len(x[1])) + with mysql_conn.cursor() as cursor: + cursor.execute( + f"SELECT abstract FROM {DRY_RUN_TABLE} WHERE pmid = %s", (worst_pmid,) + ) + stored = cursor.fetchone()["abstract"] + if isinstance(stored, bytes): + stored = stored.decode("utf-8") + integrity_ok = (stored == worst_abs) + logger.info( + f"Content-integrity check on PMID {worst_pmid} " + f"({len(worst_abs)} chars, contains poison characters): " + f"{'MATCH' if integrity_ok else 'MISMATCH'}" + ) + + if counts_ok and integrity_ok: + logger.info( + f"DRY RUN PASSED -- {inserted} row(s) inserted; {stats['c']} present; " + f"{stats['d']} distinct PMIDs; 0 NULL pmids; 0 NULL abstractVarchar; " + f"content stored verbatim." + ) + else: + logger.error( + f"DRY RUN FAILED -- rows={stats['c']} (expected {len(all_results)}); " + f"null_pmid={stats['nullp']}; null_varchar={stats['nullv']}; " + f"integrity_ok={integrity_ok}" + ) + finally: + with mysql_conn.cursor() as cursor: + cursor.execute(f"DROP TEMPORARY TABLE IF EXISTS {DRY_RUN_TABLE}") + logger.info(f"Scratch table {DRY_RUN_TABLE} dropped.") + mysql_conn.close() + + +# ------------------------------------------------------------------------------ +# Main # ------------------------------------------------------------------------------ def main(): - # 1) Connect to MySQL mysql_conn = connect_mysql_server(DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME) - while True: - # 2) Fetch all missing PMIDs + if DRY_RUN: + run_dry_run(mysql_conn) + return + + prev_missing = None + for cycle in range(1, MAX_CYCLES + 1): all_pmids = fetch_missing_pmids(mysql_conn) if not all_pmids: logger.info("No more missing abstracts. 
We are done.") break - logger.info(f"Found {len(all_pmids)} PMIDs needing abstracts.") - - # 3) Remove any existing abstract.csv - csv_path = "abstract.csv" - if os.path.exists(csv_path): - os.remove(csv_path) + logger.info(f"Cycle {cycle}: found {len(all_pmids)} PMID(s) needing abstracts.") - # 4) Chunk the PMIDs - chunks = [ - all_pmids[i : i + CHUNK_SIZE] - for i in range(0, len(all_pmids), CHUNK_SIZE) - ] - logger.info(f"Created {len(chunks)} chunk(s). Each chunk up to {CHUNK_SIZE} PMIDs.") - - # Accumulate all results in memory for this iteration - all_results = [] - - # 5) Parallel fetch from DynamoDB - with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor: - future_to_chunk = { - executor.submit(fetch_abstracts_for_chunk, c): c for c in chunks - } - for future in concurrent.futures.as_completed(future_to_chunk): - try: - chunk_result = future.result() - all_results.extend(chunk_result) - except Exception as e: - logger.exception(f"Error fetching chunk: {e}") - - logger.info(f"Fetched abstracts for {len(all_results)} PMIDs in this cycle.") - - # 6) Write to CSV - with open(csv_path, "w", newline="", encoding="utf-8") as f: - writer = csv.writer(f, delimiter="\t") - writer.writerow(["pmid", "abstract"]) - for pmid, abstract_text in all_results: - writer.writerow([pmid, abstract_text]) - - # 7) Load CSV into DB - load_csv_into_reporting_abstracts(mysql_conn, csv_path) - - # We then loop again in case there are additional PMIDs that - # appeared or newly became missing. Usually, you won't see more, - # but if your data is updated behind the scenes, it handles that too. + # Safety net: if a cycle does not reduce the missing count, the + # remaining PMIDs cannot be resolved (no DynamoDB record). Stop rather + # than loop forever -- the failure mode that hung the nightly pipeline. 
+ if prev_missing is not None and len(all_pmids) >= prev_missing: + logger.warning( + f"No progress since the previous cycle ({len(all_pmids)} PMID(s) " + f"still missing); stopping. These PMIDs have no retrievable abstract." + ) + break + prev_missing = len(all_pmids) + + all_results = fetch_all_abstracts(all_pmids) + logger.info(f"Fetched abstracts for {len(all_results)} PMID(s) from DynamoDB.") + insert_abstracts(mysql_conn, all_results) + else: + logger.warning( + f"Reached the {MAX_CYCLES}-cycle safety limit with abstracts still " + f"missing; stopping. A healthy run converges in 1-2 cycles -- investigate." + ) mysql_conn.close() - logger.info("All missing abstracts have now been imported.") + logger.info("Abstract import complete.") if __name__ == "__main__": From a97b535876afd1dd955f4f0632cebc875b39248a Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Sat, 9 May 2026 08:54:39 -0400 Subject: [PATCH 09/19] feat(reporter): NIH RePORTER ETL with provenance reconciliation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new nightly ETL step (retrieveReporter.py) that pulls grant metadata and pub-grant linkages from NIH RePORTER (api.reporter.nih.gov/v2) and reconciles them against the existing PubMed-derived person_article_grant table. Three new tables (alter_add_reporter_fields_v1.2.sql): - grant_reporter_project — RePORTER /projects/search results, refreshed each cycle (truncate-reload). Includes abstract_text for cross-reference. - grant_reporter_link — RePORTER /publications/search (pmid, appl_id) pairs. - grant_provenance — long-lived per-(person, pmid, grant) audit log with source_reporter, source_reciterdb, *_first_seen, last_verified. Survives the nightly truncate-reload of person_article_grant. Keyed by (personIdentifier, pmid, core_project_num) where core_project_num is the normalized NIH grant identifier (e.g. R01DK127777). 
ETL strategy: - Projects: filtered by org_names=["WEILL MEDICAL COLL OF CORNELL UNIV"], partitioned by fiscal year to stay under the 9,999 offset cap (WCM has ~15K projects historically). 1 req/sec rate limit honored. - Publications: keyed by appl_ids from projects, batched. - Reconciliation: bulk INSERT...SELECT ON DUPLICATE KEY UPDATE on both sides. Reciterdb side stages normalized grant strings via temp table + LOAD DATA LOCAL INFILE; RePORTER side joins grant_reporter_link to person_article (userAssertion='ACCEPTED') as the false-positive guard. - Subaward caveat: WCM-as-subaward will be missed by the org filter (false-negative tradeoff accepted to keep false positives near zero). Validated on dev: 235K provenance rows from 33K reciterdb + 132K RePORTER inputs. End-to-end ~18 min (15 min publications fetch is the rate-limited floor; reconciliation completes in ~10 sec via bulk SQL). --- setup/alter_add_reporter_fields_v1.2.sql | 127 +++++++ update/retrieveReporter.py | 429 +++++++++++++++++++++++ update/run_all.py | 3 +- 3 files changed, 558 insertions(+), 1 deletion(-) create mode 100644 setup/alter_add_reporter_fields_v1.2.sql create mode 100644 update/retrieveReporter.py diff --git a/setup/alter_add_reporter_fields_v1.2.sql b/setup/alter_add_reporter_fields_v1.2.sql new file mode 100644 index 0000000..1520eb5 --- /dev/null +++ b/setup/alter_add_reporter_fields_v1.2.sql @@ -0,0 +1,127 @@ +-- ============================================================================= +-- Migration: NIH RePORTER integration (v1.2) +-- ============================================================================= +-- Adds the tables needed to ingest pub-grant linkages and project metadata +-- from NIH RePORTER (https://api.reporter.nih.gov/v2/) and to track per-pair +-- provenance over time. 
+-- +-- WHY SEPARATE TABLES (not columns on person_article_grant): +-- person_article_grant is TRUNCATE-reloaded by updateReciterDB.py every +-- night from ReCiter scoring output (see updateReciterDB.py:241). Any +-- provenance columns added directly to that table would be wiped on each +-- nightly run, defeating the purpose of *_first_seen tracking. The +-- provenance table below is updated incrementally and survives reloads. +-- +-- WHAT'S CREATED: +-- 1. grant_reporter_project — RePORTER /projects/search results +-- 2. grant_reporter_link — RePORTER /publications/search results +-- 3. grant_provenance — long-lived per-(person, pmid, grant) +-- source-and-timestamp log +-- +-- Safe to run on prod and dev. Idempotent (CREATE TABLE IF NOT EXISTS). +-- Run BEFORE deploying retrieveReporter.py. +-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- grant_reporter_project — RePORTER project metadata +-- ----------------------------------------------------------------------------- +-- One row per RePORTER appl_id returned by /projects/search for the configured +-- WCM org filter. Refreshed each ETL cycle (truncate-reload OK; no historical +-- state to preserve here — RePORTER is the source of truth). +-- +-- abstract_text is stored here as a cross-reference. The Funding UI reads +-- abstracts from Postgres (Scholars-Profile-System) where they're joined to +-- InfoEd grant rows; this column exists for ad-hoc analysis and future +-- reciterdb-side consumers. 
+-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `grant_reporter_project` ( + `appl_id` int(11) NOT NULL, + `core_project_num` varchar(32) DEFAULT NULL, + `project_title` varchar(512) DEFAULT NULL, + `org_name` varchar(255) DEFAULT NULL, + `fiscal_year` smallint(6) DEFAULT NULL, + `activity_code` varchar(8) DEFAULT NULL, + `project_start_date` date DEFAULT NULL, + `project_end_date` date DEFAULT NULL, + `abstract_text` mediumtext DEFAULT NULL, + `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`appl_id`), + KEY `core_project_num` (`core_project_num`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- ----------------------------------------------------------------------------- +-- grant_reporter_link — RePORTER pub-grant linkages +-- ----------------------------------------------------------------------------- +-- One row per (pmid, appl_id) pair returned by /publications/search. +-- Refreshed each ETL cycle (truncate-reload). The grant_provenance table +-- below is what carries history. +-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `grant_reporter_link` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `pmid` int(11) NOT NULL, + `appl_id` int(11) NOT NULL, + `core_project_num` varchar(32) DEFAULT NULL, + `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`), + UNIQUE KEY `uk_pmid_appl_id` (`pmid`, `appl_id`), + KEY `pmid` (`pmid`), + KEY `core_project_num` (`core_project_num`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- ----------------------------------------------------------------------------- +-- grant_provenance — per-(person, pmid, grant) source and timestamp log +-- ----------------------------------------------------------------------------- +-- The audit log that survives nightly truncate-reload of person_article_grant. 
+-- Keyed by (personIdentifier, pmid, core_project_num) where core_project_num +-- is the normalized NIH grant identifier (e.g. "R01DK127777" — no year suffix, +-- no spaces). For non-NIH grants the original articleGrant string is stored +-- in core_project_num as a fallback so the row is still keyable. +-- +-- Update logic (run nightly by retrieveReporter.py after person_article_grant +-- has been refreshed by retrieveArticles.py): +-- +-- 1. UPSERT from person_article_grant: any (personIdentifier, pmid, +-- normalized_grant) currently in person_article_grant gets +-- source_reciterdb=1 and last_verified=NOW(). reciterdb_first_seen is +-- set on first insert and never overwritten. +-- +-- 2. UPSERT from grant_reporter_link joined to person_article (where +-- userAssertion='ACCEPTED' to scope to confirmed WCM authors): any +-- (personIdentifier, pmid, core_project_num) seen in RePORTER gets +-- source_reporter=1 and last_verified=NOW(). reporter_first_seen is +-- set on first insert and never overwritten. +-- +-- Subaward caution: see retrieveReporter.py — we filter RePORTER projects +-- to org_names=["WEILL MEDICAL COLL OF CORNELL UNIV"] and join PMIDs to +-- person_article ACCEPTED rows. This minimizes false positives at the cost +-- of missing some legitimate WCM-as-subaward linkages. 
+-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `grant_provenance` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `personIdentifier` varchar(128) NOT NULL, + `pmid` int(11) NOT NULL, + `core_project_num` varchar(64) NOT NULL, + `appl_id` int(11) DEFAULT NULL, + `source_reporter` tinyint(1) NOT NULL DEFAULT 0, + `source_reciterdb` tinyint(1) NOT NULL DEFAULT 0, + `reporter_first_seen` datetime DEFAULT NULL, + `reciterdb_first_seen` datetime DEFAULT NULL, + `last_verified` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`), + UNIQUE KEY `uk_person_pmid_grant` (`personIdentifier`, `pmid`, `core_project_num`), + KEY `pmid` (`pmid`), + KEY `appl_id` (`appl_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, table_rows, create_time +FROM information_schema.tables +WHERE table_schema = DATABASE() + AND table_name IN ('grant_reporter_project', 'grant_reporter_link', 'grant_provenance') +ORDER BY table_name; diff --git a/update/retrieveReporter.py b/update/retrieveReporter.py new file mode 100644 index 0000000..7d938d7 --- /dev/null +++ b/update/retrieveReporter.py @@ -0,0 +1,429 @@ +# retrieveReporter.py +# +# Pulls grant metadata and pub-grant linkages from NIH RePORTER +# (https://api.reporter.nih.gov/v2/) and reconciles them against the +# ReCiter-derived person_article_grant table. +# +# Two API loops: +# 1. POST /projects/search filtered by WCM org name → grant_reporter_project +# 2. POST /publications/search keyed by appl_ids from step 1 → grant_reporter_link +# +# Then a SQL reconciliation step populates grant_provenance, the long-lived +# (person, pmid, grant)-keyed audit log that survives the nightly truncate- +# reload of person_article_grant. 
See setup/alter_add_reporter_fields_v1.2.sql +# for the full design rationale. +# +# Why we filter by org_name rather than fetching everything: +# RePORTER returns ~thousands of WCM-attributed projects. Pulling the full +# corpus would require partitioning by FY (offset cap is 9,999) and gives +# no benefit for our use case. Subaward caveat: WCM-as-sub may not appear +# under this org filter — accepted as a false-negative tradeoff to keep +# false positives near zero. + +import os +import sys +import csv +import time +import random +import re +import logging +import faulthandler +import signal +import requests +import pymysql.cursors +import pymysql.err + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('retrieveReporter.log', mode='w'), + logging.StreamHandler(sys.stdout), + ], +) +logger = logging.getLogger(__name__) + +faulthandler.enable(file=sys.stderr, all_threads=True) +faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True) + +REPORTER_BASE_URL = 'https://api.reporter.nih.gov/v2' +WCM_ORG_NAME = 'WEILL MEDICAL COLL OF CORNELL UNIV' +PAGE_LIMIT = 500 +OFFSET_CAP = 9999 +REQUEST_INTERVAL_SEC = 1.0 # NIH guidance: 1 req/sec +PUBS_BATCH_SIZE = 50 # appl_ids per /publications/search call + +# core_project_num pattern, e.g. "R01DK127777", "U01AI189285", "K23MH112873". +# Prefix is one letter + 1-2 digits (activity code; e.g. UL1/DP2 do not match) + 2 letters (IC) + 5-7 digits.
+CORE_PROJECT_RE = re.compile(r'\b([A-Z]\d{1,2}[A-Z]{2}\d{5,7})\b') + + +def connect_db(max_retries=5, backoff_factor=1): + username = os.environ['DB_USERNAME'] + password = os.environ['DB_PASSWORD'] + hostname = os.environ['DB_HOST'] + database = os.environ['DB_NAME'] + for retry in range(max_retries): + try: + conn = pymysql.connect( + user=username, + password=password, + database=database, + host=hostname, + local_infile=True, + cursorclass=pymysql.cursors.DictCursor, + ) + logger.info('Connected to database %s on %s', database, hostname) + return conn + except pymysql.err.MySQLError as err: + logger.error('DB connect attempt %d failed: %s', retry + 1, err) + time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 1)) + raise RuntimeError('Could not connect to database after retries') + + +def post_with_retry(url, payload, max_retries=5, backoff_factor=1): + """POST with exponential backoff. Honors NIH's 1 req/sec rate limit + by sleeping between successful calls in the caller.""" + for retry in range(max_retries): + try: + r = requests.post(url, json=payload, timeout=(10, 90)) + if r.status_code == 429: + wait = backoff_factor * (2 ** retry) + random.uniform(0, 5) + logger.warning('429 from RePORTER; sleeping %.1fs', wait) + time.sleep(wait) + continue + r.raise_for_status() + return r.json() + except requests.exceptions.RequestException as e: + wait = backoff_factor * (2 ** retry) + random.uniform(0, 1) + logger.error('RePORTER request failed (attempt %d): %s; sleep %.1fs', + retry + 1, e, wait) + time.sleep(wait) + raise RuntimeError(f'RePORTER request failed after {max_retries} retries: {url}') + + +def _fetch_projects_page(criteria): + """Yield project dicts for a single criteria block. 
Caller must ensure + the result set fits under OFFSET_CAP; we log and stop if it doesn't.""" + url = f'{REPORTER_BASE_URL}/projects/search' + offset = 0 + while offset <= OFFSET_CAP: + payload = { + 'criteria': criteria, + 'limit': PAGE_LIMIT, + 'offset': offset, + } + data = post_with_retry(url, payload) + results = data.get('results', []) or [] + if not results: + return + for row in results: + yield row + meta = data.get('meta', {}) or {} + total = meta.get('total', 0) + offset += PAGE_LIMIT + if offset >= total: + return + if offset > OFFSET_CAP: + logger.warning( + 'Result set has %d records but offset cap is %d; truncating. ' + 'Caller should partition further (e.g. by activity_code).', + total, OFFSET_CAP) + return + time.sleep(REQUEST_INTERVAL_SEC) + + +def fetch_projects(base_criteria): + """Yield project dicts, partitioning by fiscal year when needed to stay + under the offset cap. WCM has ~15K projects historically, which exceeds + the 9,999 offset limit on a single criteria block. + + Strategy: probe total once with the base criteria. If under the cap, + return all in one stream. 
Otherwise iterate fiscal years from the + earliest NIH grant year (1985) through next year, requesting + fiscal_years=[FY] for each.""" + probe = post_with_retry( + f'{REPORTER_BASE_URL}/projects/search', + {'criteria': base_criteria, 'limit': 1, 'offset': 0}, + ) + total = (probe.get('meta', {}) or {}).get('total', 0) + logger.info('RePORTER /projects/search reports %d total matches for base criteria', total) + + if total <= OFFSET_CAP: + yield from _fetch_projects_page(base_criteria) + return + + import datetime + end_fy = datetime.date.today().year + 1 + for fy in range(1985, end_fy + 1): + criteria = dict(base_criteria) + criteria['fiscal_years'] = [fy] + yielded_this_fy = 0 + for row in _fetch_projects_page(criteria): + yielded_this_fy += 1 + yield row + if yielded_this_fy: + logger.info('FY %d: yielded %d projects', fy, yielded_this_fy) + time.sleep(REQUEST_INTERVAL_SEC) + + +def fetch_publications_for_appl_ids(appl_ids): + """Yield (pmid, appl_id, core_project_num) tuples from /publications/search + in batches of PUBS_BATCH_SIZE.""" + url = f'{REPORTER_BASE_URL}/publications/search' + appl_ids = list({int(x) for x in appl_ids if x is not None}) + for i in range(0, len(appl_ids), PUBS_BATCH_SIZE): + batch = appl_ids[i:i + PUBS_BATCH_SIZE] + offset = 0 + while offset <= OFFSET_CAP: + payload = { + 'criteria': {'appl_ids': batch}, + 'limit': PAGE_LIMIT, + 'offset': offset, + } + data = post_with_retry(url, payload) + results = data.get('results', []) or [] + if not results: + break + for row in results: + pmid = row.get('pmid') + appl_id = row.get('applid') or row.get('appl_id') + core = row.get('coreproject') or row.get('core_project_num') + if pmid and appl_id: + yield int(pmid), int(appl_id), core + meta = data.get('meta', {}) or {} + total = meta.get('total', 0) + offset += PAGE_LIMIT + if offset >= total: + break + time.sleep(REQUEST_INTERVAL_SEC) + time.sleep(REQUEST_INTERVAL_SEC) + + +def reload_table(conn, table, rows, columns): + """Truncate `table` 
and insert `rows` (list of tuples matching `columns`). + Used for the staging tables grant_reporter_project and grant_reporter_link. + grant_provenance is upserted, not reloaded.""" + placeholders = ', '.join(['%s'] * len(columns)) + col_list = ', '.join(f'`{c}`' for c in columns) + cur = conn.cursor() + cur.execute(f'TRUNCATE TABLE `{table}`') + if rows: + sql = f'INSERT INTO `{table}` ({col_list}) VALUES ({placeholders})' + cur.executemany(sql, rows) + conn.commit() + cur.execute(f'SELECT COUNT(*) AS c FROM `{table}`') + count = cur.fetchone()['c'] + logger.info('Reloaded %s: %d rows', table, count) + + +def normalize_grant_string(raw): + """Extract a core project number (e.g. R01DK127777) from a free-text + NIH grant string. Returns None if no match — caller decides whether to + fall back to the raw string.""" + if not raw: + return None + upper = re.sub(r'[\s\-\/]', '', raw.upper()) + m = CORE_PROJECT_RE.search(upper) + return m.group(1) if m else None + + +def reconcile_provenance(conn): + """Populate grant_provenance from person_article_grant and grant_reporter_link. + + Bulk pattern: each side does a single INSERT...SELECT with ON DUPLICATE + KEY UPDATE so we make one round trip per side instead of one per row. + First_seen timestamps stick because they're only in the INSERT clause, + not the UPDATE clause.""" + import tempfile + cur = conn.cursor() + + # ----- (1) reciterdb side ----- + # Normalization (free-text articleGrant → core_project_num) happens in + # Python, so we stage the normalized rows in a temp table first via + # LOAD DATA LOCAL INFILE, then do a single bulk upsert. 
+ logger.info('Reading person_article_grant for reconciliation') + cur.execute(""" + SELECT personIdentifier, pmid, articleGrant + FROM person_article_grant + WHERE personIdentifier IS NOT NULL + AND pmid > 0 + AND articleGrant IS NOT NULL + AND articleGrant <> '' + """) + pag_rows = cur.fetchall() + logger.info('person_article_grant rows considered: %d', len(pag_rows)) + + # Normalize + dedupe in Python (the temp table's PK enforces uniqueness + # but deduping here avoids LOAD DATA INFILE warnings on duplicate rows). + seen = set() + normalized = [] + for row in pag_rows: + n = normalize_grant_string(row['articleGrant']) + if not n: + # Non-NIH fallback: sanitize control chars (CSV uses TAB delim) + n = re.sub(r'[\t\n\r]', ' ', row['articleGrant'])[:64] + key = (row['personIdentifier'], row['pmid'], n) + if key in seen: + continue + seen.add(key) + normalized.append(key) + logger.info('Normalized + deduped to %d distinct (person, pmid, grant) rows', + len(normalized)) + + csv_file = tempfile.NamedTemporaryFile( + delete=False, mode='w', suffix='.csv', newline='', encoding='utf-8') + try: + writer = csv.writer(csv_file, delimiter='\t', lineterminator='\n', + quoting=csv.QUOTE_NONE, escapechar='\\') + for r in normalized: + writer.writerow(r) + csv_file.close() + + cur.execute("DROP TEMPORARY TABLE IF EXISTS _reciter_grant_staging") + cur.execute(""" + CREATE TEMPORARY TABLE _reciter_grant_staging ( + personIdentifier VARCHAR(128) NOT NULL, + pmid INT NOT NULL, + core_project_num VARCHAR(64) NOT NULL, + PRIMARY KEY (personIdentifier, pmid, core_project_num) + ) ENGINE=InnoDB + """) + load_sql = ( + f"LOAD DATA LOCAL INFILE '{csv_file.name}' " + "INTO TABLE _reciter_grant_staging " + "FIELDS TERMINATED BY '\\t' LINES TERMINATED BY '\\n' " + "(personIdentifier, pmid, core_project_num)" + ) + cur.execute(load_sql) + cur.execute("SELECT COUNT(*) AS c FROM _reciter_grant_staging") + logger.info('Loaded %d rows into reciterdb staging table', + cur.fetchone()['c']) + + 
cur.execute(""" + INSERT INTO grant_provenance + (personIdentifier, pmid, core_project_num, + source_reciterdb, reciterdb_first_seen, last_verified) + SELECT personIdentifier, pmid, core_project_num, + 1, NOW(), NOW() + FROM _reciter_grant_staging + ON DUPLICATE KEY UPDATE + source_reciterdb = 1, + last_verified = NOW() + """) + # rowcount on bulk upsert is "1 per insert + 2 per update" in MariaDB + # — informative, not exact + logger.info('Reciterdb-side bulk upsert: %d rowcount', cur.rowcount) + cur.execute("DROP TEMPORARY TABLE _reciter_grant_staging") + conn.commit() + finally: + try: + os.unlink(csv_file.name) + except OSError: + pass + + # ----- (2) RePORTER side ----- + # Pure SQL — no Python iteration. The JOIN to person_article enforces + # the false-positive guard (only ACCEPTED PMIDs credit a person). + # GROUP BY collapses cases where one (person, pmid, core_project) has + # multiple appl_ids (different fiscal years of the same grant); MAX + # picks the most recent appl_id deterministically. 
+ logger.info('Running RePORTER-side bulk upsert') + cur.execute(""" + INSERT INTO grant_provenance + (personIdentifier, pmid, core_project_num, appl_id, + source_reporter, reporter_first_seen, last_verified) + SELECT pa.personIdentifier, grl.pmid, grl.core_project_num, + MAX(grl.appl_id), 1, NOW(), NOW() + FROM grant_reporter_link grl + JOIN person_article pa + ON pa.pmid = grl.pmid + AND pa.userAssertion = 'ACCEPTED' + WHERE grl.core_project_num IS NOT NULL + GROUP BY pa.personIdentifier, grl.pmid, grl.core_project_num + ON DUPLICATE KEY UPDATE + source_reporter = 1, + appl_id = COALESCE(VALUES(appl_id), grant_provenance.appl_id), + last_verified = NOW() + """) + logger.info('RePORTER-side bulk upsert: %d rowcount', cur.rowcount) + conn.commit() + + # Summary + cur.execute("SELECT COUNT(*) AS c FROM grant_provenance") + total = cur.fetchone()['c'] + cur.execute("SELECT COUNT(*) AS c FROM grant_provenance WHERE source_reporter = 1 AND source_reciterdb = 1") + both = cur.fetchone()['c'] + cur.execute("SELECT COUNT(*) AS c FROM grant_provenance WHERE source_reporter = 1 AND source_reciterdb = 0") + rep_only = cur.fetchone()['c'] + cur.execute("SELECT COUNT(*) AS c FROM grant_provenance WHERE source_reporter = 0 AND source_reciterdb = 1") + reciter_only = cur.fetchone()['c'] + logger.info('Provenance totals: %d rows | both=%d | reporter-only=%d | reciter-only=%d', + total, both, rep_only, reciter_only) + + +def main(): + org_name = os.environ.get('REPORTER_ORG_NAME', WCM_ORG_NAME) + logger.info('Starting RePORTER ETL for org: %s', org_name) + + conn = connect_db() + + # ----- Loop A: projects ----- + # No include_fields — the API expects CamelCase there ('ApplId') but + # response field names are snake_case ('appl_id'). Easier to take all + # fields back than maintain two name conventions. 
+ project_rows = [] + appl_ids = [] + for proj in fetch_projects(base_criteria={'org_names': [org_name]}): + appl_id = proj.get('appl_id') + if not appl_id: + continue + appl_ids.append(appl_id) + org = (proj.get('organization') or {}).get('org_name') + project_rows.append(( + int(appl_id), + proj.get('core_project_num'), + (proj.get('project_title') or '')[:512], + (org or '')[:255], + proj.get('fiscal_year'), + proj.get('activity_code'), + proj.get('project_start_date'), + proj.get('project_end_date'), + proj.get('abstract_text'), + )) + logger.info('Fetched %d RePORTER projects', len(project_rows)) + reload_table( + conn, + 'grant_reporter_project', + project_rows, + ['appl_id', 'core_project_num', 'project_title', 'org_name', + 'fiscal_year', 'activity_code', 'project_start_date', + 'project_end_date', 'abstract_text'], + ) + + # ----- Loop B: publications ----- + link_rows = [] + seen_pairs = set() + for pmid, appl_id, core in fetch_publications_for_appl_ids(appl_ids): + key = (pmid, appl_id) + if key in seen_pairs: + continue + seen_pairs.add(key) + link_rows.append((pmid, appl_id, core)) + logger.info('Fetched %d unique (pmid, appl_id) pairs', len(link_rows)) + reload_table( + conn, + 'grant_reporter_link', + link_rows, + ['pmid', 'appl_id', 'core_project_num'], + ) + + # ----- Reconciliation ----- + reconcile_provenance(conn) + + conn.close() + logger.info('RePORTER ETL complete') + + +if __name__ == '__main__': + main() diff --git a/update/run_all.py b/update/run_all.py index e53e2fb..6ebe245 100644 --- a/update/run_all.py +++ b/update/run_all.py @@ -112,7 +112,8 @@ def main(): scripts = [ ("executeFeatureGenerator", "python3 executeFeatureGenerator.py"), ("retrieveArticles", "python3 retrieveArticles.py"), - ("retrieveNIH", "python3 retrieveNIH.py"), + ("retrieveNIH", "python3 retrieveNIH.py"), + ("retrieveReporter", "python3 retrieveReporter.py"), ("nightlyIndexing", "bash run_nightly_indexing.sh"), ("abstractImport", "python3 abstractImport.py"), 
("conflictsImport", "python3 conflictsImport.py") From 3d88b9221c00a95033619a2c43d11211058776c6 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Mon, 18 May 2026 00:17:11 -0400 Subject: [PATCH 10/19] feat(reporter): capture NIH RePORTER project terms (#291) RePORTER /projects/search returns NIH-curated keyword vocabularies alongside the abstract. Capture them into grant_reporter_project so the Scholars-Profile-System funding ETL can project them onto grants as a topical search signal (issue #291). - alter_add_reporter_terms_v1.3.sql: ADD COLUMN project_terms, pref_terms via the information_schema-guarded idiom, safe on the live table. - v1.2 CREATE TABLE: mirror the two columns for fresh builds. - retrieveReporter.py: pull `terms` and `pref_terms` from each project dict, stored raw (terms angle-bracket-wrapped, pref_terms semicolon-delimited). --- setup/alter_add_reporter_fields_v1.2.sql | 7 +++ setup/alter_add_reporter_terms_v1.3.sql | 59 ++++++++++++++++++++++++ update/retrieveReporter.py | 7 ++- 3 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 setup/alter_add_reporter_terms_v1.3.sql diff --git a/setup/alter_add_reporter_fields_v1.2.sql b/setup/alter_add_reporter_fields_v1.2.sql index 1520eb5..39c36bb 100644 --- a/setup/alter_add_reporter_fields_v1.2.sql +++ b/setup/alter_add_reporter_fields_v1.2.sql @@ -33,6 +33,11 @@ -- abstracts from Postgres (Scholars-Profile-System) where they're joined to -- InfoEd grant rows; this column exists for ad-hoc analysis and future -- reciterdb-side consumers. +-- +-- project_terms / pref_terms hold the NIH-curated keyword vocabulary RePORTER +-- returns per project, stored raw (project_terms angle-bracket-wrapped, +-- pref_terms semicolon-delimited). Added by alter_add_reporter_terms_v1.3.sql; +-- mirrored into the CREATE TABLE here so a fresh build matches (issue #291). 
-- ----------------------------------------------------------------------------- CREATE TABLE IF NOT EXISTS `grant_reporter_project` ( @@ -45,6 +50,8 @@ CREATE TABLE IF NOT EXISTS `grant_reporter_project` ( `project_start_date` date DEFAULT NULL, `project_end_date` date DEFAULT NULL, `abstract_text` mediumtext DEFAULT NULL, + `project_terms` text DEFAULT NULL, + `pref_terms` text DEFAULT NULL, `last_fetched_at` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`appl_id`), KEY `core_project_num` (`core_project_num`) diff --git a/setup/alter_add_reporter_terms_v1.3.sql b/setup/alter_add_reporter_terms_v1.3.sql new file mode 100644 index 0000000..0b4d634 --- /dev/null +++ b/setup/alter_add_reporter_terms_v1.3.sql @@ -0,0 +1,59 @@ +-- ============================================================================= +-- Migration: NIH RePORTER project terms (v1.3) +-- ============================================================================= +-- Adds two columns to grant_reporter_project for the NIH-curated keyword +-- vocabulary RePORTER returns alongside the abstract: +-- - project_terms — RePORTER `terms`, angle-bracket-wrapped (<a><b><c>) +-- - pref_terms — RePORTER `pref_terms`, semicolon-delimited (a;b;c) +-- +-- Stored raw, verbatim from the API. Parsing into a keyword array happens +-- downstream in the Scholars-Profile-System ETL (issue #291); reciterdb keeps +-- the unparsed strings so a future reciterdb-side consumer can re-parse. +-- +-- WHY AN ALTER, NOT THE CREATE TABLE in v1.2: +-- alter_add_reporter_fields_v1.2.sql creates grant_reporter_project with +-- CREATE TABLE IF NOT EXISTS — a no-op once the table exists, so editing its +-- body would not add columns to a live table. This file uses the +-- information_schema-guarded ALTER idiom (cf. v1.1) so it is safe on a +-- populated prod/dev table. The two columns were also added to v1.2's +-- CREATE TABLE so a fresh build matches. +-- +-- Safe to run on prod and dev.
Idempotent (information_schema guard; no-op on +-- re-run). No AFTER clause — keeps ALGORITHM=INSTANT eligible. +-- +-- Run BEFORE deploying the updated retrieveReporter.py, otherwise the project +-- INSERT will fail with "Unknown column" on the 2 new fields. +-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- grant_reporter_project: + project_terms + pref_terms +-- ----------------------------------------------------------------------------- + +SET @db = DATABASE(); + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'grant_reporter_project' + AND column_name = 'project_terms') = 0, + 'ALTER TABLE grant_reporter_project ADD COLUMN `project_terms` text DEFAULT NULL', + 'SELECT ''grant_reporter_project.project_terms already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'grant_reporter_project' + AND column_name = 'pref_terms') = 0, + 'ALTER TABLE grant_reporter_project ADD COLUMN `pref_terms` text DEFAULT NULL', + 'SELECT ''grant_reporter_project.pref_terms already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = DATABASE() + AND table_name = 'grant_reporter_project' + AND column_name IN ('project_terms', 'pref_terms') +ORDER BY ordinal_position; diff --git a/update/retrieveReporter.py b/update/retrieveReporter.py index 7d938d7..2088dc6 100644 --- a/update/retrieveReporter.py +++ b/update/retrieveReporter.py @@ 
-390,6 +390,11 @@ def main(): proj.get('project_start_date'), proj.get('project_end_date'), proj.get('abstract_text'), + # NIH-curated keyword vocabularies, stored raw (issue #291). + # 'terms' is angle-bracket-wrapped (<a><b><c>); 'pref_terms' is + # semicolon-delimited. Parsed downstream by the SPS funding ETL. + proj.get('terms'), + proj.get('pref_terms'), )) logger.info('Fetched %d RePORTER projects', len(project_rows)) reload_table( @@ -398,7 +403,7 @@ def main(): project_rows, ['appl_id', 'core_project_num', 'project_title', 'org_name', 'fiscal_year', 'activity_code', 'project_start_date', - 'project_end_date', 'abstract_text'], + 'project_end_date', 'abstract_text', 'project_terms', 'pref_terms'], ) # ----- Loop B: publications ----- From 001c1bd6677ecbb3545e9d49769d83d0882a5606 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Mon, 18 May 2026 08:53:36 -0400 Subject: [PATCH 11/19] fix(reporter): dedup projects by appl_id before grant_reporter_project reload fetch_projects() partitions the search by fiscal year when the corpus exceeds RePORTER's 9,999 offset cap (WCM has ~15K projects). RePORTER returns a multi-year project under every fiscal year it was active, so the same appl_id comes back in multiple FY pages. appl_id is grant_reporter_project's PRIMARY KEY, so the unguarded reload hit "IntegrityError 1062: Duplicate entry for key 'PRIMARY'" and the run aborted after TRUNCATE had already emptied the table. Dedup the projects loop by appl_id, mirroring the seen_pairs guard the publications loop already uses. --- update/retrieveReporter.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/update/retrieveReporter.py b/update/retrieveReporter.py index 2088dc6..36c8e88 100644 --- a/update/retrieveReporter.py +++ b/update/retrieveReporter.py @@ -374,10 +374,19 @@ def main(): # fields back than maintain two name conventions.
project_rows = [] appl_ids = [] + seen_appl_ids = set() for proj in fetch_projects(base_criteria={'org_names': [org_name]}): appl_id = proj.get('appl_id') if not appl_id: continue + # RePORTER returns a project under every fiscal year it was active, so + # the FY-partitioned fetch (used when the corpus exceeds the 9,999 + # offset cap) yields the same appl_id in multiple pages. appl_id is + # grant_reporter_project's PRIMARY KEY, so dedup before the reload — + # mirrors the seen_pairs guard in the publications loop below. + if appl_id in seen_appl_ids: + continue + seen_appl_ids.add(appl_id) appl_ids.append(appl_id) org = (proj.get('organization') or {}).get('org_name') project_rows.append(( From 2ada77f963ec633006b1720bf808228fecd77abf Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Tue, 19 May 2026 07:44:06 -0400 Subject: [PATCH 12/19] fix(docker): add missing COPY for retrieveReporter.py run_all.py invokes retrieveReporter.py, but the Dockerfile's per-file COPY list was never updated when the RePORTER ETL step was added (PR #81), so the built image lacked the script. Every nightly run crashed at step 4 with "can't open file '/usr/src/app/retrieveReporter.py'", halting the pipeline before nightly indexing, abstractImport, and conflictsImport. With restartPolicy: OnFailure this produced an indefinite ~95-min restart loop. 
--- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 8501d0b..f4f344b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,6 +13,7 @@ ENV PYTHONUNBUFFERED=1 # Copy additional Python scripts COPY update/retrieveNIH.py ./ +COPY update/retrieveReporter.py ./ COPY update/retrieveAltmetric.py ./ COPY update/retrieveArticles.py ./ COPY update/updateReciterDB.py ./ From 87d9443c0a9bedd4fc0b3e038ef4726a69b3091b Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Wed, 20 May 2026 17:34:34 -0400 Subject: [PATCH 13/19] fix(reporting_abstracts): repair cross-paper concatenation from old CSV/LOAD DATA path The pre-PR-#78 abstractImport.py wrote rows with csv.writer (RFC-4180 doubled-quote escaping) and LOAD DATA LOCAL INFILE (backslash escaping). On rows whose abstract text contained ", MySQL's parser desynced and consumed multiple TSV rows into one field until the next recoverable delimiter, producing a single reporting_abstracts row attributed to one pmid but containing concatenated text from several other papers in the same DynamoDB batch. The May 16 fix (PR #78) stopped new corruption but did not repair existing rows. fetch_missing_pmids() uses LEFT JOIN ... WHERE a.pmid IS NULL, so any pmid with a (corrupted) row was permanently skipped. Audit against prod found 3,124 corrupted rows out of 391,238 total. 84% are pegged at the 65,535-byte BLOB cap with the original pmid's abstract at the head and text from several unrelated papers at the tail. 181 additional rows are clean-pair duplicates from the same era (no UNIQUE constraint on pmid; concurrent old-import runs produced identical content twice). Verification against DynamoDB also spared 464 legitimately long abstracts (structured / consensus / multi-arm trials, lengths up to ~32K) that a length-only filter would have wrongly purged. This change ships the tooling and schema for the cleanup: - update/auditAbstracts.py -- read-only forensic audit. 
Compares reporting_abstracts.abstract against DynamoDB.PubMedArticle (the same source abstractImport.py uses) and classifies each row >= 4000 chars as CLEAN, PREFIX_CORRUPTED, DISJOINT, EMPTY_IN_DYNAMO, or MISSING_IN_DYNAMO. Writes audit_abstracts.csv plus a dump of the worst offenders. - update/repairAbstracts.py -- destructive cleanup. Backs up affected rows to a timestamped table, deletes corrupted rows in batches, then dedupes any remaining pmid duplicates by keeping MIN(id). Requires --apply; default is dry-run. Confirms the v1.4 migration precondition (no duplicates) after running. - setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql -- adds UNIQUE KEY on reporting_abstracts.pmid. information_schema-guarded; aborts if any duplicates remain. Mirrors the analysis_nih fix from PR #71 after the Dec 2025 duplicate-loading incident. - setup/createDatabaseTableReciterDb.sql -- idx_pmid is now UNIQUE for fresh installs. - .gitignore -- excludes audit / repair artifacts at the repo root (they contain prod abstract text). DBA runbook after merge: 1. python3 update/auditAbstracts.py 2. python3 update/repairAbstracts.py (dry-run; review counts) 3. python3 update/repairAbstracts.py --apply 4. mysql ... < setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql 5. Next nightly abstractImport.py backfills the deleted PMIDs via the parameterized path. Refs: #87, PR #78. 
--- .gitignore | 8 + ...r_add_uq_pmid_reporting_abstracts_v1.4.sql | 84 +++++ setup/createDatabaseTableReciterDb.sql | 2 +- update/auditAbstracts.py | 318 ++++++++++++++++ update/repairAbstracts.py | 339 ++++++++++++++++++ 5 files changed, 750 insertions(+), 1 deletion(-) create mode 100644 setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql create mode 100644 update/auditAbstracts.py create mode 100644 update/repairAbstracts.py diff --git a/.gitignore b/.gitignore index 0f6ba4c..b2dd5b2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,14 @@ update/*.log update/app.log update/retrieveNIH.log update/temp/ +retrieveNIH.log + +# One-shot audit / repair artifacts (contain prod abstract text; never commit) +audit_abstracts.csv +audit_abstracts_dump.txt +invalid_pmids.txt +invalid_pmids.sql +reporting_abstracts_corrupt_backup_*.sql # Legacy ML models (unused) update/*.keras diff --git a/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql new file mode 100644 index 0000000..a6e3211 --- /dev/null +++ b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql @@ -0,0 +1,84 @@ +-- ============================================================================= +-- Migration: UNIQUE KEY on reporting_abstracts.pmid (v1.4) +-- ============================================================================= +-- Replaces the existing non-unique `idx_pmid` index on reporting_abstracts +-- with a UNIQUE KEY so the parser-desync class of failure that corrupted +-- ~3,100 rows historically (issue #87, pre-PR #78 CSV / LOAD DATA path) can +-- no longer silently produce duplicate-pmid rows. +-- +-- WHY THIS IS NEEDED: +-- update/abstractImport.py's fetch_missing_pmids() uses +-- LEFT JOIN reporting_abstracts a ON a.pmid = p.pmid WHERE a.pmid IS NULL +-- so the import path *assumes* one-row-per-pmid. The schema never +-- enforced it. 
This migration codifies the assumption, mirroring the +-- analysis_nih fix from March (PR #71/#72 after the Dec 2025 duplicate +-- loading incident). +-- +-- PRECONDITION: +-- reporting_abstracts must contain zero duplicate pmids. The +-- information_schema-guarded block at the top aborts the migration with a +-- readable error if duplicates remain (run update/repairAbstracts.py +-- first; it warns when duplicates are present). +-- +-- Safe to run on prod and dev. Idempotent (information_schema guard; +-- re-runs are no-ops once the UNIQUE KEY exists). No AFTER clause; the +-- ALTER converts the existing BTREE index in place. +-- ============================================================================= + +SET @db = DATABASE(); + +-- ----------------------------------------------------------------------------- +-- Precondition: no duplicate pmids. +-- ----------------------------------------------------------------------------- + +SET @dup_count = ( + SELECT COUNT(*) FROM ( + SELECT pmid FROM reporting_abstracts + GROUP BY pmid HAVING COUNT(*) > 1 + ) d +); + +SET @sql = IF( + @dup_count > 0, + CONCAT( + 'SELECT ', + '''Migration aborted: reporting_abstracts has ', + @dup_count, + ' duplicate pmid value(s). 
Run update/repairAbstracts.py and resolve ', + 'duplicates before re-running this migration.'' AS error, ', + '1/0 AS force_error' + ), + 'SELECT ''No duplicate pmids; precondition satisfied.'' AS status' +); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- reporting_abstracts.idx_pmid: KEY -> UNIQUE KEY +-- ----------------------------------------------------------------------------- + +SET @already_unique = ( + SELECT COUNT(*) FROM information_schema.statistics + WHERE table_schema = @db + AND table_name = 'reporting_abstracts' + AND index_name = 'idx_pmid' + AND non_unique = 0 +); + +SET @sql = IF( + @already_unique > 0, + 'SELECT ''reporting_abstracts.idx_pmid is already UNIQUE; no-op.''', + 'ALTER TABLE reporting_abstracts + DROP INDEX idx_pmid, + ADD UNIQUE KEY idx_pmid (pmid) USING BTREE' +); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, index_name, non_unique, column_name, index_type +FROM information_schema.statistics +WHERE table_schema = DATABASE() + AND table_name = 'reporting_abstracts' + AND index_name = 'idx_pmid'; diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql index cdedbd7..91c8641 100644 --- a/setup/createDatabaseTableReciterDb.sql +++ b/setup/createDatabaseTableReciterDb.sql @@ -801,7 +801,7 @@ CREATE TABLE IF NOT EXISTS `reporting_abstracts` ( `abstract` blob DEFAULT NULL, `abstractVarchar` varchar(15000) DEFAULT NULL, PRIMARY KEY (`id`), - KEY `idx_pmid` (`pmid`) USING BTREE + UNIQUE KEY `idx_pmid` (`pmid`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; CREATE TABLE IF NOT EXISTS `reporting_ad_hoc_feature_generator_execution` ( diff --git 
a/update/auditAbstracts.py b/update/auditAbstracts.py new file mode 100644 index 0000000..b56dbcb --- /dev/null +++ b/update/auditAbstracts.py @@ -0,0 +1,318 @@ +""" +auditAbstracts.py -- one-shot forensic audit of reporting_abstracts. + +Pulls rows where LENGTH(abstract) >= AUDIT_LENGTH_THRESHOLD, fetches the +DynamoDB ground truth for each PMID via the same path abstractImport.py +uses, and classifies each row: + + CLEAN DB matches Dynamo (long but legitimate abstract). + PREFIX_CORRUPTED First ~150 chars of the Dynamo abstract appear near + the start of the DB blob and DB is substantially + longer than Dynamo -- the cross-paper concatenation + pattern produced by the old CSV / LOAD DATA path. + DISJOINT DB front does not match Dynamo front; needs manual + review. + MISSING_IN_DYNAMO DynamoDB has no PubMedArticle record for the PMID. + EMPTY_IN_DYNAMO Record present but yields empty abstract. + +Outputs: + - audit_abstracts.csv one row per PMID examined + - audit_abstracts_dump.txt full text dump of the top N corrupted rows + - per-verdict counters and worst-offender summary to stdout + +Read-only. Does not modify reporting_abstracts. 
+ +Env: + DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME + AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION + AUDIT_LENGTH_THRESHOLD (default 4000) + AUDIT_MAX_CANDIDATES (default 1000) +""" + +import concurrent.futures +import csv +import logging +import os +import sys +import time + +import boto3 +import pymysql.cursors +import pymysql.err + + +DB_USERNAME = os.getenv("DB_USERNAME") +DB_PASSWORD = os.getenv("DB_PASSWORD") +DB_HOST = os.getenv("DB_HOST") +DB_NAME = os.getenv("DB_NAME") + +LENGTH_THRESHOLD = int(os.getenv("AUDIT_LENGTH_THRESHOLD", "4000")) +MAX_CANDIDATES = int(os.getenv("AUDIT_MAX_CANDIDATES", "1000")) + +CHUNK_SIZE = 100 +MAX_WORKERS = 5 +MAX_UNPROCESSED_RETRIES = 8 + +OUTPUT_CSV = "audit_abstracts.csv" +DUMP_FILE = "audit_abstracts_dump.txt" +DUMP_TOP_N = 5 + +# Compare on the first HEAD_SAMPLE chars of the Dynamo abstract; require +# it to be found within the first HEAD_SEARCH_WINDOW chars of the DB blob. +# Short enough to tolerate leading-character noise (the orphan `"` and +# similar CSV artifacts), long enough to be specific. +HEAD_SAMPLE = 150 +HEAD_SEARCH_WINDOW = 400 +# A DB blob this much longer than Dynamo is the concatenation signal. +LENGTH_INFLATION_RATIO = 1.3 +# BLOB-cap rule: a row right at the column cap with a Dynamo abstract many +# times smaller is the parser-desync fingerprint regardless of whether the +# first 150 chars happen to match (PubMed sometimes updated section labels +# between the original CSV load and now, which can defeat the head-string +# match). 
+BLOB_CAP_THRESHOLD = 60000 +BLOB_CAP_INFLATION_RATIO = 5 + +logging.basicConfig( + stream=sys.stdout, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) +logging.getLogger("botocore").setLevel(logging.WARNING) +logging.getLogger("boto3").setLevel(logging.WARNING) + + +def connect_mysql(): + try: + return pymysql.connect( + user=DB_USERNAME, + password=DB_PASSWORD, + database=DB_NAME, + host=DB_HOST, + autocommit=True, + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + ) + except pymysql.err.MySQLError as err: + logger.error(f"DB connection failed: {err}") + sys.exit(1) + + +def fetch_candidates(conn, threshold, max_rows): + sql = """ + SELECT pmid, LENGTH(abstract) AS db_len, abstract + FROM reporting_abstracts + WHERE LENGTH(abstract) >= %s + ORDER BY LENGTH(abstract) DESC + LIMIT %s + """ + with conn.cursor() as cur: + cur.execute(sql, (threshold, max_rows)) + rows = cur.fetchall() + for r in rows: + if isinstance(r["abstract"], (bytes, bytearray)): + r["abstract"] = r["abstract"].decode("utf-8", errors="replace") + r["abstract"] = r["abstract"].replace("\r\n", "\n") + return rows + + +def get_abstract(item): + """Same extraction logic as update/abstractImport.py:99.""" + medline_citation = item.get("pubmedarticle", {}).get("medlinecitation") + if not medline_citation: + return "" + article = medline_citation.get("article") + if not article: + return "" + publication_abstract = article.get("publicationAbstract") + if not publication_abstract: + return "" + abstract_texts = [] + for abstract_part in publication_abstract.get("abstractTexts", []): + label = abstract_part.get("abstractTextLabel") + text = abstract_part.get("abstractText") + if text: + label_text = f"{label}: " if label else "" + abstract_texts.append(label_text + text) + return " ".join(abstract_texts) if abstract_texts else "" + + +def fetch_abstracts_from_dynamo(pmids): + client = boto3.resource("dynamodb").meta.client 
+ + def fetch_chunk(chunk): + request_keys = [{"pmid": p} for p in chunk] + results = {} + present = set() + attempt = 0 + while request_keys: + response = client.batch_get_item( + RequestItems={"PubMedArticle": {"Keys": request_keys}} + ) + for item in response["Responses"].get("PubMedArticle", []): + pmid = item.get("pmid") + if pmid is not None: + present.add(pmid) + results[pmid] = get_abstract(item) + request_keys = ( + response.get("UnprocessedKeys", {}) + .get("PubMedArticle", {}) + .get("Keys", []) + ) + if request_keys: + attempt += 1 + if attempt > MAX_UNPROCESSED_RETRIES: + logger.warning( + f"{len(request_keys)} keys still unprocessed after " + f"{MAX_UNPROCESSED_RETRIES} retries; skipping remainder." + ) + break + time.sleep(min(0.1 * (2 ** attempt), 5.0)) + return results, present + + chunks = [pmids[i:i + CHUNK_SIZE] for i in range(0, len(pmids), CHUNK_SIZE)] + all_results = {} + found = set() + with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex: + futures = [ex.submit(fetch_chunk, c) for c in chunks] + for f in concurrent.futures.as_completed(futures): + res, present = f.result() + all_results.update(res) + found.update(present) + return all_results, found + + +def classify(db_abs, dyn_abs, dyn_present): + if not dyn_present: + return "MISSING_IN_DYNAMO" + if not dyn_abs: + return "EMPTY_IN_DYNAMO" + + db_norm = db_abs.strip() + dyn_norm = dyn_abs.strip() + if db_norm == dyn_norm: + return "CLEAN" + + db_len = len(db_norm) + dyn_len = len(dyn_norm) + + # Allow tiny tail differences (trailing whitespace/punctuation, an + # extra character or two) without flagging as corruption. 
+ if abs(db_len - dyn_len) <= 5 and db_norm[: min(db_len, dyn_len) - 5 if db_len > 5 else db_len].lstrip('"') == dyn_norm[: min(db_len, dyn_len) - 5 if db_len > 5 else dyn_len].lstrip('"'): + return "CLEAN" + + head_sample = dyn_norm[:HEAD_SAMPLE] + if head_sample and head_sample in db_norm[:HEAD_SEARCH_WINDOW]: + if db_len > dyn_len * LENGTH_INFLATION_RATIO: + return "PREFIX_CORRUPTED" + return "CLEAN" + + if db_len >= BLOB_CAP_THRESHOLD and db_len > dyn_len * BLOB_CAP_INFLATION_RATIO: + return "PREFIX_CORRUPTED" + + return "DISJOINT" + + +def safe_oneline(s, n): + return s[:n].replace("\n", " ").replace("\t", " ").replace("\r", " ") + + +def main(): + logger.info( + f"Audit: LENGTH(abstract) >= {LENGTH_THRESHOLD}; " + f"max candidates: {MAX_CANDIDATES}" + ) + conn = connect_mysql() + try: + candidates = fetch_candidates(conn, LENGTH_THRESHOLD, MAX_CANDIDATES) + finally: + conn.close() + + logger.info(f"Candidates from reporting_abstracts: {len(candidates)}") + if not candidates: + logger.info("Nothing above threshold; exiting.") + return + + lens = sorted(c["db_len"] for c in candidates) + logger.info( + f"DB length distribution: min={lens[0]} " + f"p50={lens[len(lens) // 2]} p95={lens[int(len(lens) * 0.95)]} " + f"max={lens[-1]}" + ) + + pmids = [c["pmid"] for c in candidates] + dyn_abstracts, dyn_present = fetch_abstracts_from_dynamo(pmids) + logger.info( + f"DynamoDB returned records for {len(dyn_present)} / {len(pmids)} PMIDs" + ) + + rows = [] + counters = { + "CLEAN": 0, + "PREFIX_CORRUPTED": 0, + "DISJOINT": 0, + "MISSING_IN_DYNAMO": 0, + "EMPTY_IN_DYNAMO": 0, + } + for c in candidates: + pmid = c["pmid"] + db_abs = c["abstract"] + present = pmid in dyn_present + dyn_abs = dyn_abstracts.get(pmid, "") + verdict = classify(db_abs, dyn_abs, present) + counters[verdict] += 1 + rows.append({ + "pmid": pmid, + "db_len": c["db_len"], + "dyn_len": len(dyn_abs) if present else "", + "verdict": verdict, + "db_head": safe_oneline(db_abs, 80), + "db_tail": 
safe_oneline(db_abs[-80:] if len(db_abs) >= 80 else db_abs, 80), + "dyn_head": safe_oneline(dyn_abs, 80) if present else "", + }) + + with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(rows) + logger.info(f"Per-row audit written to {OUTPUT_CSV}") + + logger.info("Verdict counts:") + for k in ("CLEAN", "PREFIX_CORRUPTED", "DISJOINT", + "MISSING_IN_DYNAMO", "EMPTY_IN_DYNAMO"): + logger.info(f" {k:18s} {counters[k]}") + + suspect = [r for r in rows if r["verdict"] in ("PREFIX_CORRUPTED", "DISJOINT")] + suspect.sort(key=lambda r: r["db_len"], reverse=True) + + if suspect: + with open(DUMP_FILE, "w", encoding="utf-8") as f: + for r in suspect[:DUMP_TOP_N]: + pmid = r["pmid"] + db_abs = next(c["abstract"] for c in candidates if c["pmid"] == pmid) + dyn_abs = dyn_abstracts.get(pmid, "") + f.write("=" * 80 + "\n") + f.write( + f"pmid={pmid} verdict={r['verdict']} " + f"db_len={r['db_len']} dyn_len={r['dyn_len']}\n" + ) + f.write("--- DB (full) ---\n") + f.write(db_abs + "\n") + f.write("--- Dynamo (full) ---\n") + f.write(dyn_abs + "\n\n") + logger.info(f"Top {DUMP_TOP_N} suspects dumped to {DUMP_FILE}") + + logger.info(f"Top {min(DUMP_TOP_N, len(suspect))} suspects (summary):") + for r in suspect[:DUMP_TOP_N]: + logger.info( + f" pmid={r['pmid']:>9} verdict={r['verdict']:17s} " + f"db_len={r['db_len']:>6} dyn_len={r['dyn_len']}" + ) + logger.info(f" db_head : {r['db_head']!r}") + logger.info(f" db_tail : {r['db_tail']!r}") + logger.info(f" dyn_head : {r['dyn_head']!r}") + + +if __name__ == "__main__": + main() diff --git a/update/repairAbstracts.py b/update/repairAbstracts.py new file mode 100644 index 0000000..30f20aa --- /dev/null +++ b/update/repairAbstracts.py @@ -0,0 +1,339 @@ +""" +repairAbstracts.py -- one-shot cleanup of reporting_abstracts rows flagged +as corrupted by update/auditAbstracts.py. 
+ +Reads audit_abstracts.csv (the audit output) and: + 1. Backs up the affected rows to reporting_abstracts_corrupt_backup_<timestamp>. + 2. Deletes the corrupted rows from reporting_abstracts in batches. + 3. Dedupes any remaining pmids that have multiple rows by keeping the + row with MIN(id) and backing up the rest to the same backup table. + (Precondition for the v1.4 UNIQUE KEY migration.) + 4. Verifies post-state row counts and confirms no duplicate pmids remain. + +After this script runs, the next nightly update/abstractImport.py will +re-fetch the deleted PMIDs cleanly via the parameterized executemany +path introduced in PR #78. + +Destructive. Requires --apply to perform the delete; without --apply it +runs in dry-run mode (counts only, no writes). + +Env: + DB_USERNAME, DB_PASSWORD, DB_HOST, DB_NAME +""" + +import argparse +import csv +import datetime +import logging +import os +import re +import sys + +import pymysql.cursors +import pymysql.err + + +INVALID_VERDICTS = {"PREFIX_CORRUPTED", "DISJOINT", "EMPTY_IN_DYNAMO"} +DEFAULT_AUDIT_CSV = "audit_abstracts.csv" +DEFAULT_BATCH_SIZE = 500 + +# Identifier safety: the backup-table suffix is timestamp-derived, but +# allow callers to override with --backup-table; whitelist the shape to +# refuse anything that would require quoting.
+SAFE_IDENT = re.compile(r"^[A-Za-z_][A-Za-z0-9_]{0,63}$") + +logging.basicConfig( + stream=sys.stdout, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", +) +logger = logging.getLogger(__name__) + + +def connect_mysql(): + try: + return pymysql.connect( + user=os.getenv("DB_USERNAME"), + password=os.getenv("DB_PASSWORD"), + database=os.getenv("DB_NAME"), + host=os.getenv("DB_HOST"), + autocommit=True, + charset="utf8mb4", + cursorclass=pymysql.cursors.DictCursor, + ) + except pymysql.err.MySQLError as err: + logger.error(f"DB connection failed: {err}") + sys.exit(1) + + +def read_invalid_pmids(audit_csv): + if not os.path.exists(audit_csv): + logger.error(f"Audit CSV not found: {audit_csv}") + logger.error("Run update/auditAbstracts.py first.") + sys.exit(1) + with open(audit_csv) as f: + rows = list(csv.DictReader(f)) + if not rows or "verdict" not in rows[0] or "pmid" not in rows[0]: + logger.error(f"{audit_csv} is missing required columns (pmid, verdict).") + sys.exit(1) + return sorted({ + int(r["pmid"]) for r in rows if r["verdict"] in INVALID_VERDICTS + }) + + +def count_matching(cur, pmids, batch=5000): + """COUNT(*) of rows whose pmid is in `pmids`, batched to avoid + oversized IN-lists. 
Returns the sum across batches.""" + total = 0 + for i in range(0, len(pmids), batch): + chunk = pmids[i:i + batch] + placeholders = ",".join(["%s"] * len(chunk)) + cur.execute( + f"SELECT COUNT(*) AS c FROM reporting_abstracts " + f"WHERE pmid IN ({placeholders})", + chunk, + ) + total += cur.fetchone()["c"] + return total + + +def backup_rows(cur, pmids, backup_table, batch): + cur.execute(f"CREATE TABLE `{backup_table}` LIKE reporting_abstracts") + inserted = 0 + for i in range(0, len(pmids), batch): + chunk = pmids[i:i + batch] + placeholders = ",".join(["%s"] * len(chunk)) + cur.execute( + f"INSERT INTO `{backup_table}` " + f"SELECT * FROM reporting_abstracts WHERE pmid IN ({placeholders})", + chunk, + ) + inserted += cur.rowcount + if (i // batch) % 5 == 0: + logger.info(f" ... backed up {inserted:,} rows") + return inserted + + +def delete_rows(cur, pmids, batch): + deleted = 0 + for i in range(0, len(pmids), batch): + chunk = pmids[i:i + batch] + placeholders = ",".join(["%s"] * len(chunk)) + cur.execute( + f"DELETE FROM reporting_abstracts WHERE pmid IN ({placeholders})", + chunk, + ) + deleted += cur.rowcount + if (i // batch) % 5 == 0: + logger.info(f" ... deleted {deleted:,} rows") + return deleted + + +def find_duplicate_pmids(cur, limit=10): + cur.execute( + "SELECT pmid, COUNT(*) AS c FROM reporting_abstracts " + "GROUP BY pmid HAVING c > 1 LIMIT %s", + (limit,), + ) + return cur.fetchall() + + +def count_duplicate_extras(cur): + """Returns (group_count, extra_row_count). extra_row_count is the number + of rows that would need to be deleted to leave one row per pmid.""" + cur.execute( + "SELECT COUNT(*) AS groups, COALESCE(SUM(c - 1), 0) AS extras FROM (" + " SELECT COUNT(*) AS c FROM reporting_abstracts GROUP BY pmid HAVING c > 1" + ") d" + ) + r = cur.fetchone() + return r["groups"], r["extras"] + + +def backup_duplicate_extras(cur, backup_table): + """Insert into the backup table every duplicate row except the MIN(id) + keeper for each pmid. 
Returns the number of rows backed up.""" + cur.execute( + f"INSERT INTO `{backup_table}` " + "SELECT ra.* FROM reporting_abstracts ra " + "JOIN (" + " SELECT pmid, MIN(id) AS keep_id FROM reporting_abstracts " + " GROUP BY pmid HAVING COUNT(*) > 1" + ") k ON k.pmid = ra.pmid AND ra.id <> k.keep_id" + ) + return cur.rowcount + + +def delete_duplicate_extras(cur): + """Delete every duplicate row except the MIN(id) keeper for each pmid. + Returns the number of rows deleted.""" + cur.execute( + "DELETE ra FROM reporting_abstracts ra " + "JOIN (" + " SELECT pmid, MIN(id) AS keep_id FROM reporting_abstracts " + " GROUP BY pmid HAVING COUNT(*) > 1" + ") k ON k.pmid = ra.pmid AND ra.id <> k.keep_id" + ) + return cur.rowcount + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--audit-csv", default=DEFAULT_AUDIT_CSV, + help=f"Audit CSV path (default {DEFAULT_AUDIT_CSV})") + parser.add_argument("--apply", action="store_true", + help="Perform the delete. 
Without this flag, dry-run only.") + parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE, + help=f"PMIDs per statement (default {DEFAULT_BATCH_SIZE})") + parser.add_argument("--backup-table", default=None, + help="Backup table name (default: reporting_abstracts_corrupt_backup_)") + args = parser.parse_args() + + pmids = read_invalid_pmids(args.audit_csv) + logger.info(f"Read {len(pmids):,} invalid PMIDs from {args.audit_csv}") + if not pmids: + logger.info("Nothing to repair.") + return + + backup_table = args.backup_table or ( + f"reporting_abstracts_corrupt_backup_" + f"{datetime.datetime.now():%Y%m%d_%H%M%S}" + ) + if not SAFE_IDENT.match(backup_table): + logger.error(f"Refusing unsafe backup-table identifier: {backup_table!r}") + sys.exit(1) + + conn = connect_mysql() + try: + with conn.cursor() as cur: + cur.execute("SELECT COUNT(*) AS c FROM reporting_abstracts") + before_total = cur.fetchone()["c"] + + matching = count_matching(cur, pmids) + logger.info( + f"reporting_abstracts: {before_total:,} rows total; " + f"{matching:,} rows match the invalid-PMID list." + ) + + if matching > len(pmids): + logger.info( + f"Live matches ({matching:,}) > unique PMIDs ({len(pmids):,}): " + f"{matching - len(pmids):,} of the audited PMIDs have multiple " + "rows in the live table (all of which will be deleted by the IN clause)." + ) + elif matching < len(pmids): + logger.warning( + f"Live matches ({matching:,}) < unique PMIDs ({len(pmids):,}): " + f"{len(pmids) - matching:,} audited PMIDs no longer present " + "(already deleted or table changed). Proceeding with what is live." + ) + + dupe_groups, dupe_extras = count_duplicate_extras(cur) + logger.info( + f"Duplicate-pmid groups: {dupe_groups:,} " + f"({dupe_extras:,} extra rows would be deduped after the corruption delete)." 
+ ) + + if not args.apply: + cur.execute( + "SELECT pmid, LENGTH(abstract) AS db_len FROM reporting_abstracts " + "WHERE LENGTH(abstract) >= 4000 ORDER BY LENGTH(abstract) DESC LIMIT 3" + ) + samples = cur.fetchall() + logger.info("Sample of longest current rows (pre-repair):") + for s in samples: + logger.info(f" pmid={s['pmid']:>9} db_len={s['db_len']}") + logger.info(f"Would back up to: `{backup_table}`") + logger.info( + f"Would delete {matching:,} corrupted rows + dedupe " + f"{dupe_extras:,} duplicate-extras (keep MIN(id) per pmid)." + ) + logger.info("DRY RUN -- no changes made. Re-run with --apply to perform the repair.") + return + + logger.info(f"Creating backup table `{backup_table}` ...") + backed_up = backup_rows(cur, pmids, backup_table, args.batch_size) + logger.info(f"Backed up {backed_up:,} rows to `{backup_table}`.") + if backed_up != matching: + logger.error( + f"Backup row count {backed_up:,} != expected {matching:,}. Aborting." + ) + sys.exit(1) + + logger.info("Deleting corrupted rows from reporting_abstracts ...") + deleted = delete_rows(cur, pmids, args.batch_size) + + cur.execute("SELECT COUNT(*) AS c FROM reporting_abstracts") + after_total = cur.fetchone()["c"] + logger.info( + f"Deleted {deleted:,} rows; reporting_abstracts now has " + f"{after_total:,} rows (was {before_total:,})." + ) + if before_total - after_total != deleted: + logger.error( + f"Row-count delta mismatch: before-after={before_total - after_total}, " + f"deleted={deleted}. Backup table `{backup_table}` is intact." + ) + sys.exit(1) + + cur.execute( + "SELECT COUNT(*) AS c FROM reporting_abstracts WHERE LENGTH(abstract) >= 4000" + ) + long_remaining = cur.fetchone()["c"] + logger.info( + f"Rows with LENGTH(abstract) >= 4000 remaining: {long_remaining:,} " + "(should approximately equal the CLEAN count from the audit)." 
+ ) + + cur.execute( + "SELECT COUNT(*) AS c FROM reporting_abstracts WHERE LENGTH(abstract) >= 60000" + ) + cap_remaining = cur.fetchone()["c"] + logger.info( + f"Rows at/above 60K (BLOB-cap region) remaining: {cap_remaining:,} " + "(should be 0 if repair caught all corruption)." + ) + + dupe_groups_after, dupe_extras_after = count_duplicate_extras(cur) + if dupe_extras_after > 0: + logger.info( + f"Phase 2: deduping {dupe_extras_after:,} extra rows across " + f"{dupe_groups_after:,} pmid groups (keeping MIN(id) per pmid)..." + ) + backed_up_dupes = backup_duplicate_extras(cur, backup_table) + logger.info(f" ... backed up {backed_up_dupes:,} duplicate rows to `{backup_table}`.") + if backed_up_dupes != dupe_extras_after: + logger.error( + f"Dedup backup count {backed_up_dupes:,} != expected {dupe_extras_after:,}. " + "Aborting before delete." + ) + sys.exit(1) + deleted_dupes = delete_duplicate_extras(cur) + logger.info(f" ... deleted {deleted_dupes:,} duplicate rows.") + if deleted_dupes != dupe_extras_after: + logger.error( + f"Dedup delete count {deleted_dupes:,} != expected {dupe_extras_after:,}. " + f"Backup table `{backup_table}` is intact." + ) + sys.exit(1) + else: + logger.info("Phase 2: no duplicates to dedupe.") + + dupes = find_duplicate_pmids(cur) + if dupes: + logger.error( + f"{len(dupes)} duplicate pmid(s) still present after dedup (sample): " + f"{[(d['pmid'], d['c']) for d in dupes]}" + ) + sys.exit(1) + else: + logger.info( + "No duplicate pmids remain. Safe to apply " + "setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql." + ) + finally: + conn.close() + + +if __name__ == "__main__": + main() From 8e82e7a0c8b65d0295103a69e0bb2c983502f507 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Wed, 20 May 2026 17:53:55 -0400 Subject: [PATCH 14/19] fix: repair script handles generated columns; migration guard actually halts Two bugs discovered when running the runbook end to end against prod: 1. 
update/repairAbstracts.py used `CREATE TABLE LIKE` + `INSERT INTO ... SELECT *`. Prod added a STORED generated column `abstract_len` (with a composite index `idx_abs_pmid_len`) directly without updating the repo's schema file. The SELECT * pulled the generated-column value; MariaDB's strict mode rejected it with error 1906 and the backup aborted with the destination table empty. The fix queries information_schema for non-generated columns and enumerates them in both the corrupted-rows INSERT and the dedup INSERT. The script now works whether or not abstract_len exists. 2. setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql's precondition guard used `SELECT 'aborted...' AS error, 1/0 AS force_error`. The 1/0 only emits a warning in MariaDB's default SQL mode -- it does NOT halt the script. So when the DBA uploaded the migration before running the repair, the precondition SELECT was printed but execution continued to the ALTER, which then failed with `Duplicate entry '9182809' for key 'idx_pmid'`. The fix synthesizes a SELECT against a non-existent table whose name encodes the duplicate count (`__migration_aborted_reporting_abstracts_has_N_duplicate_pmids__run_update_repairAbstracts_py_first`). The resulting "Table doesn't exist" error halts the SQL client immediately, and the error text itself tells the operator what to do. Schema drift (the live abstract_len + idx_abs_pmid_len that aren't in createDatabaseTableReciterDb.sql) is a separate concern not addressed here. 
--- ...r_add_uq_pmid_reporting_abstracts_v1.4.sql | 14 ++++++---- update/repairAbstracts.py | 27 ++++++++++++++++--- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql index a6e3211..abff544 100644 --- a/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql +++ b/setup/alter_add_uq_pmid_reporting_abstracts_v1.4.sql @@ -29,6 +29,13 @@ SET @db = DATABASE(); -- ----------------------------------------------------------------------------- -- Precondition: no duplicate pmids. +-- +-- If duplicates exist, the precondition synthesizes a SELECT against a +-- non-existent table whose name encodes the duplicate count. The resulting +-- "Table doesn't exist" error halts execution (SELECT-with-1/0 only emits +-- a warning, which MariaDB ignored outside a stored program in v1 of this +-- migration -- the cleanup was attempted, ALTER ran anyway, ALTER failed +-- on the first duplicate pmid). -- ----------------------------------------------------------------------------- SET @dup_count = ( @@ -41,12 +48,9 @@ SET @dup_count = ( SET @sql = IF( @dup_count > 0, CONCAT( - 'SELECT ', - '''Migration aborted: reporting_abstracts has ', + 'SELECT 1 FROM `__migration_aborted_reporting_abstracts_has_', @dup_count, - ' duplicate pmid value(s). Run update/repairAbstracts.py and resolve ', - 'duplicates before re-running this migration.'' AS error, ', - '1/0 AS force_error' + '_duplicate_pmids__run_update_repairAbstracts_py_first`' ), 'SELECT ''No duplicate pmids; precondition satisfied.'' AS status' ); diff --git a/update/repairAbstracts.py b/update/repairAbstracts.py index 30f20aa..3f9466d 100644 --- a/update/repairAbstracts.py +++ b/update/repairAbstracts.py @@ -97,15 +97,31 @@ def count_matching(cur, pmids, batch=5000): return total +def writable_columns(cur, table="reporting_abstracts"): + """Return the list of non-generated columns (those that accept INSERT). 
+ Prod has a STORED generated column abstract_len that cannot be assigned; + INSERT must enumerate the real columns explicitly.""" + cur.execute( + "SELECT column_name FROM information_schema.columns " + "WHERE table_schema = DATABASE() AND table_name = %s " + " AND (extra IS NULL OR extra NOT LIKE '%%GENERATED%%') " + "ORDER BY ordinal_position", + (table,), + ) + return [r["column_name"] for r in cur.fetchall()] + + def backup_rows(cur, pmids, backup_table, batch): cur.execute(f"CREATE TABLE `{backup_table}` LIKE reporting_abstracts") + cols = writable_columns(cur) + col_list = ", ".join(f"`{c}`" for c in cols) inserted = 0 for i in range(0, len(pmids), batch): chunk = pmids[i:i + batch] placeholders = ",".join(["%s"] * len(chunk)) cur.execute( - f"INSERT INTO `{backup_table}` " - f"SELECT * FROM reporting_abstracts WHERE pmid IN ({placeholders})", + f"INSERT INTO `{backup_table}` ({col_list}) " + f"SELECT {col_list} FROM reporting_abstracts WHERE pmid IN ({placeholders})", chunk, ) inserted += cur.rowcount @@ -153,9 +169,12 @@ def count_duplicate_extras(cur): def backup_duplicate_extras(cur, backup_table): """Insert into the backup table every duplicate row except the MIN(id) keeper for each pmid. 
Returns the number of rows backed up.""" + cols = writable_columns(cur) + col_list = ", ".join(f"`{c}`" for c in cols) + select_list = ", ".join(f"ra.`{c}`" for c in cols) cur.execute( - f"INSERT INTO `{backup_table}` " - "SELECT ra.* FROM reporting_abstracts ra " + f"INSERT INTO `{backup_table}` ({col_list}) " + f"SELECT {select_list} FROM reporting_abstracts ra " "JOIN (" " SELECT pmid, MIN(id) AS keep_id FROM reporting_abstracts " " GROUP BY pmid HAVING COUNT(*) > 1" From b3eaf6d25c105d62f2c53c95360451b944a8f54e Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Mon, 8 Jun 2026 19:31:22 -0400 Subject: [PATCH 15/19] feat(setup): add durable authorship_review table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Durable curator-state table for the Publication Manager 'Authorships' review queue (Curator_All). Survives the nightly truncate-reload like grant_provenance — it is in no truncate list (updateReciterDB.py all_tables) and touched by no stored procedure; populated externally by the adversarial-attribution-review producer. CREATE TABLE IF NOT EXISTS, additive, no change to existing ETL. --- setup/table_authorship_review.sql | 82 +++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 setup/table_authorship_review.sql diff --git a/setup/table_authorship_review.sql b/setup/table_authorship_review.sql new file mode 100644 index 0000000..366986c --- /dev/null +++ b/setup/table_authorship_review.sql @@ -0,0 +1,82 @@ +-- ----------------------------------------------------------------------------- +-- authorship_review — Publication Manager "Authorships" review queue +-- ----------------------------------------------------------------------------- +-- DURABLE TABLE — survives the nightly truncate-reload. Like `grant_provenance` +-- (the (person,pmid,grant) audit log) and the `admin_*` tables, this is curator +-- state, NOT a reporting export. 
It MUST NOT be added to any truncate list +-- (see update/updateReciterDB.py `all_tables`) and is not touched by any nightly +-- stored procedure or ETL step. CREATE TABLE IF NOT EXISTS so re-applying is safe. +-- +-- One row per WCM-affiliated AUTHORSHIP (a PubMed author carrying a WCM affiliation +-- on a publication) that is NOT yet assigned to any identity. Powers the Curator_All +-- `/authorships` tab in ReCiter-Publication-Manager (reads this table via Sequelize). +-- +-- POPULATED EXTERNALLY (this repo's ETL cannot compute the scores). The producer is +-- the adversarial-attribution-review pipeline in the ReCiter Research project +-- (scripts/aar_orchestrator.py -> aar_db.py upsert), which runs the gate (reciterdb +-- analysis_summary_author = accepted set), the identity matcher (reciterdb identity), +-- and the pinned XGBoost 3.2.0 models over the S3 scoring inputs to compute the +-- feedback-identity (FG) and identity-only (IO) scores per authorship. Monthly cron. +-- +-- Classification per authorship (the producer sets it): +-- absent top candidate never scored by production (no person_article row) +-- suggested top candidate production final (FG) >= 30 — already in a pending queue +-- buried top candidate FG < 30 (IO can be high) — production buried it +-- assigned reserved (accepted rows are excluded by the gate, not stored here) +-- +-- single_candidate = exactly one WCM identity matches the author's surname + +-- given/initial (cohort_size == 1) — the strongest precision signal; such rows are +-- near-certain and form the high-precision review lane. +-- +-- Refresh contract: the producer UPSERTs by author_key, refreshing the scoring/ +-- classification columns and `last_refreshed`; it NEVER overwrites a curator-set +-- `status` (assigned/accepted/rejected/dismissed/snoozed) or its resolution_cwid/ +-- reviewer/note/snooze_until, and `first_seen` is set once and never overwritten. 
+-- ----------------------------------------------------------------------------- + +CREATE TABLE IF NOT EXISTS `authorship_review` ( + `id` BIGINT NOT NULL AUTO_INCREMENT, + `pmid` BIGINT NOT NULL, + `author_key` VARCHAR(32) NOT NULL, -- `pmid:position` + `author_position` INT NULL, + `author_position_label` VARCHAR(8) NULL, -- first/middle/last + `wcm_author` VARCHAR(255) NULL, -- PubMed author name + `author_affiliation` TEXT NULL, + `entrez_date` DATE NULL, -- ReCiter entrez add date + `title` TEXT NULL, + `journal` VARCHAR(512) NULL, + `doi` VARCHAR(255) NULL, + `classification` ENUM('assigned','suggested','buried','absent') NULL, + `top_cwid` VARCHAR(32) NULL, -- proposed identity + `top_name` VARCHAR(255) NULL, + `top_person_type` VARCHAR(64) NULL, + `top_dept` VARCHAR(255) NULL, + `top_fg_score` FLOAT NULL, -- production final (FG) + `top_io_score` FLOAT NULL, -- identity-only (IO) + `top_confidence` FLOAT NULL, + `top_cohort_size` INT NULL, -- homonyms (surname+initial) + `top_given_match` VARCHAR(16) NULL, -- full|initial + `top_affil_match` TINYINT(1) NULL, + `n_candidates` INT NULL, + `single_candidate` TINYINT(1) NULL, -- cohort_size == 1 + `candidate_cwids_json` LONGTEXT NULL, -- ranked alternates + `status` ENUM('open','assigned','accepted','rejected','dismissed','snoozed') + NOT NULL DEFAULT 'open', -- curator state + `resolution_cwid` VARCHAR(32) NULL, + `reviewer` VARCHAR(64) NULL, + `note` TEXT NULL, + `snooze_until` DATE NULL, + `resolved_at` DATETIME NULL, + `first_seen` DATETIME NULL, -- set once, never overwritten + `last_refreshed` DATETIME NULL, + `last_checked` DATETIME NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `uq_author_key` (`author_key`), + KEY `ix_pmid` (`pmid`), + KEY `ix_classification` (`classification`), + KEY `ix_status` (`status`), + KEY `ix_single_candidate` (`single_candidate`), + KEY `ix_top_io_score` (`top_io_score`), + KEY `ix_entrez_date` (`entrez_date`), + KEY `ix_top_cwid` (`top_cwid`) +) ENGINE=InnoDB DEFAULT 
CHARSET=utf8mb4; From b6f97dcc497906dcf73a68a693a75f9d507c6b5d Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Tue, 9 Jun 2026 07:56:56 -0400 Subject: [PATCH 16/19] feat(setup): add admin_users scope/proxy column migration (v1.5) Adds scope_person_types, scope_org_units, proxy_person_ids (JSON NULL) to admin_users via an idempotent information_schema-guarded ALTER, for existing databases that predate the fresh-build schema (#92, master). The Publication Manager dev-branch AdminUser model (commit 579d32f) selects these columns during login; without them findOrcreateAdminUser fails with 'Unknown column' and authentication returns 401 for every user. Run before deploying the PM dev branch against an existing reciterdb (e.g. production). Additive only; admin_users is durable (not in updateReciterDB.py all_tables truncate list). No-op on re-run. --- ...lter_add_admin_user_scope_columns_v1.5.sql | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 setup/alter_add_admin_user_scope_columns_v1.5.sql diff --git a/setup/alter_add_admin_user_scope_columns_v1.5.sql b/setup/alter_add_admin_user_scope_columns_v1.5.sql new file mode 100644 index 0000000..98e4877 --- /dev/null +++ b/setup/alter_add_admin_user_scope_columns_v1.5.sql @@ -0,0 +1,76 @@ +-- ============================================================================= +-- Migration: Add admin_users scope/proxy columns (v1.5) +-- ============================================================================= +-- Adds the three JSON scope columns the Publication Manager AdminUser model +-- now selects on every login: +-- - scope_person_types (JSON, nullable) — person-type curation scope +-- - scope_org_units (JSON, nullable) — org-unit curation scope +-- - proxy_person_ids (JSON, nullable) — proxied person identifiers +-- +-- WHY THIS MIGRATION EXISTS: +-- ReCiter-Publication-Manager (dev branch, model commit 579d32f +-- "extend AdminUser model with scope/proxy JSON columns") issues +-- SELECT 
userID, personIdentifier, ..., scope_person_types, +-- scope_org_units, proxy_person_ids FROM admin_users +-- inside findOrcreateAdminUser during authentication. If admin_users is +-- missing these columns the SELECT fails with ER_BAD_FIELD_ERROR +-- ("Unknown column 'scope_person_types'"), the authorize() call throws, and +-- login returns 401 for every user. The columns must exist before the PM +-- dev branch is deployed against this database. +-- +-- The fresh-build schema (setup/createDatabaseTableReciterDb.sql on master, +-- PR #92) already defines admin_users WITH these columns, so new databases +-- are fine. This migration brings EXISTING databases (e.g. the production +-- reciterdb, which predates #92 and has none of the three) up to that +-- schema. There was no ALTER path for existing DBs until now. +-- +-- DURABILITY: admin_users is curator state, not a reporting export. It is NOT +-- in update/updateReciterDB.py's truncate list (`all_tables`) and is not +-- touched by any nightly stored procedure or ETL step, so these columns +-- persist across nightly reload. +-- +-- Safe to run on prod and dev. Uses IF NOT EXISTS-style guards via an +-- information_schema check (no-op on re-run). Additive only — no existing +-- column or row is modified. Run BEFORE deploying the PM dev branch. 
+-- ============================================================================= + +-- ----------------------------------------------------------------------------- +-- admin_users: + scope_person_types + scope_org_units + proxy_person_ids +-- ----------------------------------------------------------------------------- + +SET @db = DATABASE(); + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'admin_users' + AND column_name = 'scope_person_types') = 0, + 'ALTER TABLE admin_users ADD COLUMN `scope_person_types` JSON DEFAULT NULL', + 'SELECT ''admin_users.scope_person_types already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'admin_users' + AND column_name = 'scope_org_units') = 0, + 'ALTER TABLE admin_users ADD COLUMN `scope_org_units` JSON DEFAULT NULL', + 'SELECT ''admin_users.scope_org_units already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +SET @sql = (SELECT IF( + (SELECT COUNT(*) FROM information_schema.columns + WHERE table_schema = @db AND table_name = 'admin_users' + AND column_name = 'proxy_person_ids') = 0, + 'ALTER TABLE admin_users ADD COLUMN `proxy_person_ids` JSON DEFAULT NULL', + 'SELECT ''admin_users.proxy_person_ids already exists''')); +PREPARE stmt FROM @sql; EXECUTE stmt; DEALLOCATE PREPARE stmt; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = DATABASE() + AND table_name = 'admin_users' + AND column_name IN ('scope_person_types', 'scope_org_units', 'proxy_person_ids') +ORDER BY ordinal_position; From c856c2c7c027b06c21cf29d33120db82db9a2318 
Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Sun, 14 Jun 2026 12:40:21 -0400 Subject: [PATCH 17/19] feat(etl): scan ArticleProvenance (DynamoDB) -> article_provenance table (#95) Add a nightly ETL step that scans the ReCiter DynamoDB ArticleProvenance table and loads first-retrieval provenance into a new reciterdb table, article_provenance, so Publication Manager can display the date a publication was first retrieved (PM #737). - update/retrieveArticleProvenance.py: streams the scan into an article_provenance_new staging table (INSERT IGNORE per page to bound memory and collapse duplicates), validates row counts against production, then atomic RENAME-swaps. Converts frd (epoch seconds, UTC) to DATETIME. Mirrors the retrieveNIH staging->swap pattern. - setup/createDatabaseTableReciterDb.sql: article_provenance DDL for fresh installs, keyed (pmid, personIdentifier). - setup/alter_add_article_provenance_v1.6.sql: migration for existing dev/prod databases. - update/run_all.py: run the step before nightly indexing as non-fatal so a failure cannot block the indexing SP. - Dockerfile: COPY the new script. - README: document the step and backfill the missing retrieveReporter row. The table is keyed on (pmid, personIdentifier): the source DynamoDB table has a composite key uid (HASH) + articleId (RANGE), one item per person+article, so frd is per-person -- not global-per-article as issue #95 assumed. 
--- Dockerfile | 1 + README.md | 13 +- setup/alter_add_article_provenance_v1.6.sql | 56 ++++ setup/createDatabaseTableReciterDb.sql | 17 + update/retrieveArticleProvenance.py | 353 ++++++++++++++++++++ update/run_all.py | 24 +- 6 files changed, 453 insertions(+), 11 deletions(-) create mode 100644 setup/alter_add_article_provenance_v1.6.sql create mode 100644 update/retrieveArticleProvenance.py diff --git a/Dockerfile b/Dockerfile index f4f344b..a388a25 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,6 +14,7 @@ ENV PYTHONUNBUFFERED=1 # Copy additional Python scripts COPY update/retrieveNIH.py ./ COPY update/retrieveReporter.py ./ +COPY update/retrieveArticleProvenance.py ./ COPY update/retrieveAltmetric.py ./ COPY update/retrieveArticles.py ./ COPY update/updateReciterDB.py ./ diff --git a/README.md b/README.md index c39a9e1..dc22384 100644 --- a/README.md +++ b/README.md @@ -69,12 +69,15 @@ CronJob (daily) ├─ 1. executeFeatureGenerator.py Trigger ReCiter ML scoring API ├─ 2. retrieveArticles.py Pull person/article data from S3 + DynamoDB ├─ 3. retrieveNIH.py NIH iCite API → analysis_nih (atomic swap) - ├─ 4. run_nightly_indexing.sh Run populateAnalysisSummaryTables_v2() + ├─ 4. retrieveReporter.py NIH RePORTER grants → grant_reporter_* (reconcile) + ├─ 5. retrieveArticleProvenance.py ArticleProvenance (DynamoDB) → article_provenance + │ └─ non-fatal: a failure here does not block nightly indexing + ├─ 6. run_nightly_indexing.sh Run populateAnalysisSummaryTables_v2() │ ├─ Polls analysis_job_log every 3s for progress │ ├─ Auto-retries 3x with 60s backoff │ └─ Auto-restores from backup on failure - ├─ 5. abstractImport.py PubMed abstracts from DynamoDB - └─ 6. conflictsImport.py COI statements from DynamoDB + ├─ 7. abstractImport.py PubMed abstracts from DynamoDB + └─ 8. 
conflictsImport.py COI statements from DynamoDB ``` **Key patterns:** @@ -105,6 +108,8 @@ ReCiterDB/ │ ├── run_nightly_indexing.sh # SP runner with monitoring/retry │ ├── retrieveArticles.py # S3 + DynamoDB article fetcher │ ├── retrieveNIH.py # NIH iCite fetcher (atomic swap) +│ ├── retrieveReporter.py # NIH RePORTER grants fetcher (reconcile) +│ ├── retrieveArticleProvenance.py # ArticleProvenance (DynamoDB) → article_provenance │ ├── retrieveAltmetric.py # Altmetric API fetcher │ ├── updateReciterDB.py # Bulk loader (LOAD DATA LOCAL INFILE) │ ├── dataTransformer.py # ReCiter JSON → CSV @@ -288,6 +293,8 @@ All defined in `setup/createEventsProceduresReciterDb.sql`. | `run_all.py` | EKS orchestrator: runs all pipeline steps in sequence with timeout enforcement, memory logging, and S3 log upload | | `retrieveArticles.py` | Fetches person and article data from S3 and DynamoDB in batches | | `retrieveNIH.py` | Fetches NIH iCite metrics in batches of 150; loads to staging table with validation and atomic swap | +| `retrieveReporter.py` | Fetches NIH RePORTER grants/linkages and reconciles them into `grant_reporter_*` / `grant_provenance` | +| `retrieveArticleProvenance.py` | Scans DynamoDB `ArticleProvenance` → `article_provenance` (pmid, personIdentifier) via staging + atomic swap; runs non-fatal | | `retrieveAltmetric.py` | Fetches Altmetric scores for articles published in the last 2 years | | `updateReciterDB.py` | Bulk data loader using `LOAD DATA LOCAL INFILE` with retry and reconnect logic | | `dataTransformer.py` | Transforms ReCiter JSON output to CSV format for all `person_*` tables | diff --git a/setup/alter_add_article_provenance_v1.6.sql b/setup/alter_add_article_provenance_v1.6.sql new file mode 100644 index 0000000..206ca9f --- /dev/null +++ b/setup/alter_add_article_provenance_v1.6.sql @@ -0,0 +1,56 @@ +-- ============================================================================= +-- Migration: Add article_provenance table (v1.6) +-- 
============================================================================= +-- Creates the article_provenance table on EXISTING databases (dev, prod). Fresh +-- builds already get it from setup/createDatabaseTableReciterDb.sql (ReCiterDB#95). +-- +-- WHAT IT IS: +-- First-retrieval provenance per (article, person), loaded nightly by +-- update/retrieveArticleProvenance.py from the ReCiter DynamoDB +-- `ArticleProvenance` table. That source table has a COMPOSITE key +-- uid (HASH, personIdentifier) + articleId (RANGE, PMID), so there is one item +-- per person+article. This table mirrors that key exactly -- PRIMARY KEY +-- (pmid, personIdentifier) -- rather than collapsing to one row per PMID. +-- +-- Columns map from DynamoDB attributes: +-- pmid <- articleId (String PMID -> INT) +-- personIdentifier <- uid +-- firstRetrievalDate <- frd (epoch SECONDS, UTC -> DATETIME) +-- retrievalStrategy <- rs (PM_UI_SEARCH, PM_AUTHOR, ...) +-- source <- src (PM, CTSC, GS, MAN, MAN_FROM_PM, ...) +-- +-- WHY THIS MIGRATION EXISTS: +-- Publication Manager #737 displays "date a publication was first retrieved" +-- in /curate and reads this table by PMID. The table must exist before the PM +-- #737 branch is deployed against this database. +-- +-- DURABILITY / ETL CONTRACT: +-- Loaded via a staging table (article_provenance_new) + atomic RENAME swap by +-- the nightly ETL, exactly like analysis_nih. It is NOT in +-- update/updateReciterDB.py's truncate list and is not touched by any nightly +-- stored procedure. A failure in the ETL step leaves production untouched and +-- does not block the rest of run_all.py (the step runs as non-fatal). +-- +-- Safe to run on prod and dev. CREATE TABLE IF NOT EXISTS is a no-op on re-run +-- and additive only -- no existing table or row is modified. 
+-- ============================================================================= + +CREATE TABLE IF NOT EXISTS `article_provenance` ( + `pmid` int(11) NOT NULL, + `personIdentifier` varchar(128) NOT NULL, + `firstRetrievalDate` datetime DEFAULT NULL, + `retrievalStrategy` varchar(64) DEFAULT NULL, + `source` varchar(32) DEFAULT NULL, + PRIMARY KEY (`pmid`, `personIdentifier`), + KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +-- ----------------------------------------------------------------------------- +-- Verification +-- ----------------------------------------------------------------------------- + +SELECT table_name, column_name, data_type, character_maximum_length, is_nullable, column_key +FROM information_schema.columns +WHERE table_schema = DATABASE() + AND table_name = 'article_provenance' +ORDER BY ordinal_position; diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql index 91c8641..d5127fe 100644 --- a/setup/createDatabaseTableReciterDb.sql +++ b/setup/createDatabaseTableReciterDb.sql @@ -492,6 +492,23 @@ CREATE TABLE IF NOT EXISTS `analysis_temp_output_table_cell` ( KEY `personIdentifier` (`personIdentifier`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +-- article_provenance: first-retrieval provenance per (article, person), loaded +-- nightly by update/retrieveArticleProvenance.py from the ReCiter DynamoDB +-- `ArticleProvenance` table (composite key uid + articleId). Keyed on +-- (pmid, personIdentifier) to mirror that source key exactly -- one row per +-- person+article. firstRetrievalDate is `frd` (epoch seconds, UTC) converted to +-- DATETIME. Loaded via staging->atomic-swap; NOT in any truncate list. +-- Consumed by Publication Manager #737 ("date a publication was first retrieved"). 
+CREATE TABLE IF NOT EXISTS `article_provenance` ( + `pmid` int(11) NOT NULL, + `personIdentifier` varchar(128) NOT NULL, + `firstRetrievalDate` datetime DEFAULT NULL, + `retrievalStrategy` varchar(64) DEFAULT NULL, + `source` varchar(32) DEFAULT NULL, + PRIMARY KEY (`pmid`, `personIdentifier`), + KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + -- ============================================================================ -- Journal Tables -- ============================================================================ diff --git a/update/retrieveArticleProvenance.py b/update/retrieveArticleProvenance.py new file mode 100644 index 0000000..ec06254 --- /dev/null +++ b/update/retrieveArticleProvenance.py @@ -0,0 +1,353 @@ +# retrieveArticleProvenance.py +# +# Nightly ETL step that scans the ReCiter DynamoDB `ArticleProvenance` table and +# loads it into the reciterdb `article_provenance` table. Powers the Publication +# Manager "date a publication was first retrieved" display in /curate +# (wcmc-its/ReCiter-Publication-Manager#737); backend half of ReCiterDB#95. +# +# Source table (reciter.service.dynamo.ArticleProvenanceServiceImpl): +# - COMPOSITE key: `uid` (HASH, the personIdentifier/CWID) + `articleId` (RANGE, +# the PMID as a String). One item per (person, article) pair. +# - `frd` = first retrieval date, epoch SECONDS, written with if_not_exists so it +# is immutable once set (the first time that person retrieved that article). +# - `rs` = first retrieval strategy (PM_UI_SEARCH, PM_AUTHOR, ...). +# - `src` = source (PM, CTSC, GS, MAN, MAN_FROM_PM, ...). +# - `ads` = String Set of all strategies seen (not loaded here). +# +# reciterdb target is keyed on (pmid, personIdentifier) -- it mirrors the DynamoDB +# composite key exactly (one row per person+article), so no cross-person collapse +# is performed. 
frd (epoch seconds, UTC) is converted to a DATETIME on load to +# match the rest of reciterdb; PM formats it for display. +# +# Memory: rows are streamed into the staging table one scan page at a time +# (INSERT IGNORE), so peak RSS is bounded to one page regardless of corpus size. +# INSERT IGNORE also collapses any (pmid, personIdentifier) collision (e.g. a +# case-variant uid under the utf8mb4_unicode_ci PK) rather than aborting the load. +# +# Atomicity: mirrors retrieveNIH.py -- load into a `article_provenance_new` +# staging table, validate it against production, then RENAME-swap. A failure here +# leaves production untouched. run_all.py runs this step as NON-FATAL so a hiccup +# does not block the nightly indexing SP (PM reads this table directly; nothing +# downstream depends on it). + +import os +import sys +import time +import random +import logging +import faulthandler +import signal +from datetime import datetime, timezone + +import boto3 +from botocore.config import Config +from botocore.exceptions import BotoCoreError, ClientError +import pymysql.cursors +import pymysql.err + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler('retrieveArticleProvenance.log', mode='w'), + logging.StreamHandler(sys.stdout), + ], +) +logger = logging.getLogger(__name__) + +faulthandler.enable(file=sys.stderr, all_threads=True) +faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True) + +DYNAMO_TABLE = 'ArticleProvenance' +TARGET_TABLE = 'article_provenance' +STAGING_TABLE = 'article_provenance_new' +BACKUP_TABLE = 'article_provenance_backup' +SCAN_PAGE_SIZE = 1000 + +# Validation floor: reject a partial/empty scan that would otherwise seed or +# replace production with too little data (matches retrieveNIH's min_rows). +MIN_STAGING_ROWS = 100 +# Warn if more than this fraction of scanned items are skipped (data-quality signal). 
+SKIP_RATIO_WARN = 0.10 + +# Plausible bounds for frd (epoch seconds). Anything outside is treated as corrupt +# and stored NULL rather than a bogus DATETIME. Lower bound = 2000-01-01 UTC. +MIN_EPOCH_SECONDS = 946684800 + +# DDL kept in sync with setup/createDatabaseTableReciterDb.sql and +# setup/alter_add_article_provenance_v1.6.sql. Created defensively so a fresh +# environment that has not yet run the migration still works. +CREATE_TARGET_SQL = f""" +CREATE TABLE IF NOT EXISTS `{TARGET_TABLE}` ( + `pmid` int(11) NOT NULL, + `personIdentifier` varchar(128) NOT NULL, + `firstRetrievalDate` datetime DEFAULT NULL, + `retrievalStrategy` varchar(64) DEFAULT NULL, + `source` varchar(32) DEFAULT NULL, + PRIMARY KEY (`pmid`, `personIdentifier`), + KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci +""" + +LOAD_COLUMNS = ['pmid', 'personIdentifier', 'firstRetrievalDate', + 'retrievalStrategy', 'source'] + + +def connect_db(max_retries=5, backoff_factor=1): + username = os.environ['DB_USERNAME'] + password = os.environ['DB_PASSWORD'] + hostname = os.environ['DB_HOST'] + database = os.environ['DB_NAME'] + for retry in range(max_retries): + try: + conn = pymysql.connect( + user=username, + password=password, + database=database, + host=hostname, + charset='utf8mb4', + cursorclass=pymysql.cursors.DictCursor, + ) + logger.info('Connected to database %s on %s', database, hostname) + return conn + except pymysql.err.MySQLError as err: + logger.error('DB connect attempt %d failed: %s', retry + 1, err) + time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 1)) + raise RuntimeError('Could not connect to database after retries') + + +def epoch_to_datetime_str(frd): + """Convert an epoch-seconds value (DynamoDB Decimal/int/str) to a UTC + 'YYYY-MM-DD HH:MM:SS' string, or None if absent/invalid/out-of-range. 
frd is + stored as UTC; Publication Manager formats it for display.""" + if frd is None: + return None + try: + secs = int(frd) + except (TypeError, ValueError): + logger.warning('Unparseable frd value: %r; storing NULL', frd) + return None + # Reject implausible timestamps (corrupt data) rather than store a bogus year. + upper = int(datetime.now(tz=timezone.utc).timestamp()) + 86400 # now + 1 day skew + if secs < MIN_EPOCH_SECONDS or secs > upper: + logger.warning('Out-of-range frd value: %r; storing NULL', frd) + return None + try: + return datetime.fromtimestamp(secs, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S') + except (OverflowError, OSError, ValueError): + logger.warning('Unconvertible frd value: %r; storing NULL', frd) + return None + + +def scan_article_provenance(dynamo_table): + """Generator that yields pages (lists of items) from a full scan of the + ArticleProvenance table. Eventually-consistent read is fine for a nightly + snapshot (frd is immutable once written).""" + total = 0 + last_key = None + while True: + kwargs = {'Limit': SCAN_PAGE_SIZE} + if last_key: + kwargs['ExclusiveStartKey'] = last_key + response = dynamo_table.scan(**kwargs) + items = response.get('Items', []) + total += len(items) + if items: + logger.info('Scanned %d items from %s (running total: %d).', + len(items), DYNAMO_TABLE, total) + yield items + last_key = response.get('LastEvaluatedKey') + if not last_key: + break + logger.info('Finished scanning %s. 
Total items: %d.', DYNAMO_TABLE, total) + + +def item_to_row(item): + """Map one ArticleProvenance item to a row tuple matching LOAD_COLUMNS, or + return None to skip (missing uid / non-numeric articleId).""" + uid = item.get('uid') + if not uid: + return None + try: + pmid = int(item.get('articleId')) + except (TypeError, ValueError): + return False # distinguishes bad-pmid from no-uid for counting + first_retrieval = epoch_to_datetime_str(item.get('frd')) + rs = item.get('rs') + src = item.get('src') + rs = str(rs)[:64] if rs is not None else None + src = str(src)[:32] if src is not None else None + return (pmid, str(uid)[:128], first_retrieval, rs, src) + + +def stream_into_staging(conn, cursor, dynamo_table): + """Scan ArticleProvenance and INSERT IGNORE each page into the staging table. + Streaming keeps peak memory to one page; INSERT IGNORE collapses any + (pmid, personIdentifier) duplicate (incl. case-variant uids under the + case-insensitive PK) instead of aborting. Returns scan/skip stats.""" + col_list = ', '.join(f'`{c}`' for c in LOAD_COLUMNS) + placeholders = ', '.join(['%s'] * len(LOAD_COLUMNS)) + sql = f"INSERT IGNORE INTO `{STAGING_TABLE}` ({col_list}) VALUES ({placeholders})" + + scanned = skipped_no_uid = skipped_bad_pmid = 0 + for page in scan_article_provenance(dynamo_table): + scanned += len(page) + page_rows = [] + for item in page: + row = item_to_row(item) + if row is None: + skipped_no_uid += 1 + elif row is False: + skipped_bad_pmid += 1 + else: + page_rows.append(row) + if page_rows: + cursor.executemany(sql, page_rows) + conn.commit() + + cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`") + staged = cursor.fetchone()['c'] + skipped = skipped_no_uid + skipped_bad_pmid + logger.info('Scanned %d items; staged %d rows (skipped %d no-uid, %d bad-pmid).', + scanned, staged, skipped_no_uid, skipped_bad_pmid) + if scanned and (skipped / scanned) > SKIP_RATIO_WARN: + logger.warning('High skip ratio: %d/%d (%.1f%%) of scanned items 
were ' + 'dropped; staged table may be partial.', + skipped, scanned, 100.0 * skipped / scanned) + return {'scanned': scanned, 'staged': staged} + + +def create_staging_table(cursor): + """(Re)create the staging table as an empty clone of production.""" + cursor.execute(CREATE_TARGET_SQL) # ensure production exists (fresh env) + cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`") + cursor.execute(f"CREATE TABLE `{STAGING_TABLE}` LIKE `{TARGET_TABLE}`") + logger.info('Created staging table %s', STAGING_TABLE) + + +def recover_orphaned_backup(conn, cursor): + """Self-heal from a prior run that died after RENAMEing production away but + before the swap completed: if production is gone but a backup exists, restore + it so we never fabricate an empty production table over a good backup.""" + cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'") + if cursor.fetchone(): + return + cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'") + if cursor.fetchone(): + logger.warning('Production %s missing but %s present (orphaned prior run); ' + 'restoring backup before proceeding.', TARGET_TABLE, BACKUP_TABLE) + cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`") + conn.commit() + + +def validate_staging(cursor, min_rows=MIN_STAGING_ROWS, min_percentage=80): + """Guard against replacing a healthy production table with a partial/empty + scan. 
Requires the staging table to meet a row floor and (when production + already has data) to be at least min_percentage of the production row count.""" + cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`") + staging_count = cursor.fetchone()['c'] + cursor.execute(f"SELECT COUNT(*) AS c FROM `{TARGET_TABLE}`") + production_count = cursor.fetchone()['c'] + logger.info('Validation: %s has %d rows, %s has %d rows', + STAGING_TABLE, staging_count, TARGET_TABLE, production_count) + + if staging_count < min_rows: + logger.error('Validation FAILED: %s has %d rows (minimum %d)', + STAGING_TABLE, staging_count, min_rows) + return False + if production_count > 0: + percentage = (staging_count / production_count) * 100 + logger.info('Staging is %.1f%% of production', percentage) + if percentage < min_percentage: + logger.error('Validation FAILED: staging (%d) is only %.1f%% of ' + 'production (%d); minimum %d%%', + staging_count, percentage, production_count, min_percentage) + return False + logger.info('Validation PASSED for %s', STAGING_TABLE) + return True + + +def atomic_swap(conn, cursor): + """Atomically swap staging into production: production -> backup, staging -> + production, in a single RENAME TABLE (atomic in MariaDB/InnoDB).""" + cursor.execute(f"DROP TABLE IF EXISTS `{BACKUP_TABLE}`") + rename_sql = (f"RENAME TABLE `{TARGET_TABLE}` TO `{BACKUP_TABLE}`, " + f"`{STAGING_TABLE}` TO `{TARGET_TABLE}`") + logger.info('Executing atomic swap: %s', rename_sql) + cursor.execute(rename_sql) + conn.commit() + logger.info('Atomic swap completed for %s', TARGET_TABLE) + + +def restore_from_backup(conn, cursor): + """Rename the backup table back to production if a swap failed mid-flight. 
+ Because the swap is a single atomic RENAME, a raised swap means NEITHER table + was renamed and the backup was already dropped -- so 'no backup found' here is + the expected, SAFE outcome (production was never moved), not a data-loss event.""" + cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'") + if not cursor.fetchone(): + logger.info('No backup table %s to restore (production left untouched).', + BACKUP_TABLE) + return + cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'") + if cursor.fetchone(): + cursor.execute(f"DROP TABLE IF EXISTS `{TARGET_TABLE}`") + cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`") + conn.commit() + logger.info('Restored %s from backup', TARGET_TABLE) + + +def cleanup_staging(conn, cursor): + cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`") + conn.commit() + logger.info('Cleaned up staging table %s', STAGING_TABLE) + + +def main(): + conn = connect_db() + cursor = conn.cursor() + + cfg = Config(retries={'max_attempts': 10, 'mode': 'standard'}) + dynamo_table = boto3.resource('dynamodb', config=cfg).Table(DYNAMO_TABLE) + + try: + recover_orphaned_backup(conn, cursor) + create_staging_table(cursor) + conn.commit() + + stream_into_staging(conn, cursor, dynamo_table) + + if not validate_staging(cursor): + logger.error('Validation failed; aborting swap to protect production.') + cleanup_staging(conn, cursor) + sys.exit(1) + + try: + atomic_swap(conn, cursor) + except Exception as swap_err: + logger.error('Atomic swap failed: %s; attempting restore.', swap_err) + restore_from_backup(conn, cursor) + cleanup_staging(conn, cursor) + sys.exit(1) + + logger.info('SUCCESS: %s updated with zero downtime.', TARGET_TABLE) + + except (BotoCoreError, ClientError) as e: + logger.error('DynamoDB error during %s scan: %s', DYNAMO_TABLE, e) + cleanup_staging(conn, cursor) + sys.exit(1) + except pymysql.err.MySQLError as e: + logger.error('Database error: %s', e) + try: + cleanup_staging(conn, cursor) + except Exception: + pass 
+ sys.exit(1) + finally: + cursor.close() + conn.close() + logger.info('Database connection closed.') + + +if __name__ == '__main__': + main() diff --git a/update/run_all.py b/update/run_all.py index 6ebe245..61713a1 100644 --- a/update/run_all.py +++ b/update/run_all.py @@ -109,22 +109,30 @@ def upload_log_to_s3(): # ------------- Main Flow ------------- def main(): + # (name, command, non_fatal). non_fatal=True steps log a warning and continue + # on failure instead of aborting the pipeline. scripts = [ - ("executeFeatureGenerator", "python3 executeFeatureGenerator.py"), - ("retrieveArticles", "python3 retrieveArticles.py"), - ("retrieveNIH", "python3 retrieveNIH.py"), - ("retrieveReporter", "python3 retrieveReporter.py"), - ("nightlyIndexing", "bash run_nightly_indexing.sh"), - ("abstractImport", "python3 abstractImport.py"), - ("conflictsImport", "python3 conflictsImport.py") + ("executeFeatureGenerator", "python3 executeFeatureGenerator.py", False), + ("retrieveArticles", "python3 retrieveArticles.py", False), + ("retrieveNIH", "python3 retrieveNIH.py", False), + ("retrieveReporter", "python3 retrieveReporter.py", False), + # article_provenance feeds a PM display field only; nothing downstream + # depends on it, so a failure here must not block nightly indexing. 
+ ("retrieveArticleProvenance", "python3 retrieveArticleProvenance.py", True), + ("nightlyIndexing", "bash run_nightly_indexing.sh", False), + ("abstractImport", "python3 abstractImport.py", False), + ("conflictsImport", "python3 conflictsImport.py", False) ] overall_success = True - for name, cmd in scripts: + for name, cmd, non_fatal in scripts: #ok = run_script(name, cmd) ok = run_script(name, cmd, timeout_seconds=int(os.getenv("SCRIPT_TIMEOUT_SECONDS", "15000"))) if not ok: + if non_fatal: + logger.warning(f"⚠️ NON-FATAL: {name} failed; continuing pipeline.") + continue overall_success = False logger.error("Stopping pipeline due to script failure.") break From fca972a755c63e494bf24b7822bb2864fc8d5d27 Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Sun, 14 Jun 2026 13:26:42 -0400 Subject: [PATCH 18/19] Revert "Merge pull request #96 from wcmc-its/feature/article-provenance-etl" This reverts commit acd26bb561adfa6ec69de95c312aea24f43d6d76, reversing changes made to ffd06311d22961791b40aa0e1c094566c1552e3a. --- Dockerfile | 1 - README.md | 13 +- setup/alter_add_article_provenance_v1.6.sql | 56 ---- setup/createDatabaseTableReciterDb.sql | 17 - update/retrieveArticleProvenance.py | 353 -------------------- update/run_all.py | 24 +- 6 files changed, 11 insertions(+), 453 deletions(-) delete mode 100644 setup/alter_add_article_provenance_v1.6.sql delete mode 100644 update/retrieveArticleProvenance.py diff --git a/Dockerfile b/Dockerfile index a388a25..f4f344b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,6 @@ ENV PYTHONUNBUFFERED=1 # Copy additional Python scripts COPY update/retrieveNIH.py ./ COPY update/retrieveReporter.py ./ -COPY update/retrieveArticleProvenance.py ./ COPY update/retrieveAltmetric.py ./ COPY update/retrieveArticles.py ./ COPY update/updateReciterDB.py ./ diff --git a/README.md b/README.md index dc22384..c39a9e1 100644 --- a/README.md +++ b/README.md @@ -69,15 +69,12 @@ CronJob (daily) ├─ 1. 
executeFeatureGenerator.py Trigger ReCiter ML scoring API ├─ 2. retrieveArticles.py Pull person/article data from S3 + DynamoDB ├─ 3. retrieveNIH.py NIH iCite API → analysis_nih (atomic swap) - ├─ 4. retrieveReporter.py NIH RePORTER grants → grant_reporter_* (reconcile) - ├─ 5. retrieveArticleProvenance.py ArticleProvenance (DynamoDB) → article_provenance - │ └─ non-fatal: a failure here does not block nightly indexing - ├─ 6. run_nightly_indexing.sh Run populateAnalysisSummaryTables_v2() + ├─ 4. run_nightly_indexing.sh Run populateAnalysisSummaryTables_v2() │ ├─ Polls analysis_job_log every 3s for progress │ ├─ Auto-retries 3x with 60s backoff │ └─ Auto-restores from backup on failure - ├─ 7. abstractImport.py PubMed abstracts from DynamoDB - └─ 8. conflictsImport.py COI statements from DynamoDB + ├─ 5. abstractImport.py PubMed abstracts from DynamoDB + └─ 6. conflictsImport.py COI statements from DynamoDB ``` **Key patterns:** @@ -108,8 +105,6 @@ ReCiterDB/ │ ├── run_nightly_indexing.sh # SP runner with monitoring/retry │ ├── retrieveArticles.py # S3 + DynamoDB article fetcher │ ├── retrieveNIH.py # NIH iCite fetcher (atomic swap) -│ ├── retrieveReporter.py # NIH RePORTER grants fetcher (reconcile) -│ ├── retrieveArticleProvenance.py # ArticleProvenance (DynamoDB) → article_provenance │ ├── retrieveAltmetric.py # Altmetric API fetcher │ ├── updateReciterDB.py # Bulk loader (LOAD DATA LOCAL INFILE) │ ├── dataTransformer.py # ReCiter JSON → CSV @@ -293,8 +288,6 @@ All defined in `setup/createEventsProceduresReciterDb.sql`. 
| `run_all.py` | EKS orchestrator: runs all pipeline steps in sequence with timeout enforcement, memory logging, and S3 log upload | | `retrieveArticles.py` | Fetches person and article data from S3 and DynamoDB in batches | | `retrieveNIH.py` | Fetches NIH iCite metrics in batches of 150; loads to staging table with validation and atomic swap | -| `retrieveReporter.py` | Fetches NIH RePORTER grants/linkages and reconciles them into `grant_reporter_*` / `grant_provenance` | -| `retrieveArticleProvenance.py` | Scans DynamoDB `ArticleProvenance` → `article_provenance` (pmid, personIdentifier) via staging + atomic swap; runs non-fatal | | `retrieveAltmetric.py` | Fetches Altmetric scores for articles published in the last 2 years | | `updateReciterDB.py` | Bulk data loader using `LOAD DATA LOCAL INFILE` with retry and reconnect logic | | `dataTransformer.py` | Transforms ReCiter JSON output to CSV format for all `person_*` tables | diff --git a/setup/alter_add_article_provenance_v1.6.sql b/setup/alter_add_article_provenance_v1.6.sql deleted file mode 100644 index 206ca9f..0000000 --- a/setup/alter_add_article_provenance_v1.6.sql +++ /dev/null @@ -1,56 +0,0 @@ --- ============================================================================= --- Migration: Add article_provenance table (v1.6) --- ============================================================================= --- Creates the article_provenance table on EXISTING databases (dev, prod). Fresh --- builds already get it from setup/createDatabaseTableReciterDb.sql (ReCiterDB#95). --- --- WHAT IT IS: --- First-retrieval provenance per (article, person), loaded nightly by --- update/retrieveArticleProvenance.py from the ReCiter DynamoDB --- `ArticleProvenance` table. That source table has a COMPOSITE key --- uid (HASH, personIdentifier) + articleId (RANGE, PMID), so there is one item --- per person+article. 
This table mirrors that key exactly -- PRIMARY KEY --- (pmid, personIdentifier) -- rather than collapsing to one row per PMID. --- --- Columns map from DynamoDB attributes: --- pmid <- articleId (String PMID -> INT) --- personIdentifier <- uid --- firstRetrievalDate <- frd (epoch SECONDS, UTC -> DATETIME) --- retrievalStrategy <- rs (PM_UI_SEARCH, PM_AUTHOR, ...) --- source <- src (PM, CTSC, GS, MAN, MAN_FROM_PM, ...) --- --- WHY THIS MIGRATION EXISTS: --- Publication Manager #737 displays "date a publication was first retrieved" --- in /curate and reads this table by PMID. The table must exist before the PM --- #737 branch is deployed against this database. --- --- DURABILITY / ETL CONTRACT: --- Loaded via a staging table (article_provenance_new) + atomic RENAME swap by --- the nightly ETL, exactly like analysis_nih. It is NOT in --- update/updateReciterDB.py's truncate list and is not touched by any nightly --- stored procedure. A failure in the ETL step leaves production untouched and --- does not block the rest of run_all.py (the step runs as non-fatal). --- --- Safe to run on prod and dev. CREATE TABLE IF NOT EXISTS is a no-op on re-run --- and additive only -- no existing table or row is modified. 
--- ============================================================================= - -CREATE TABLE IF NOT EXISTS `article_provenance` ( - `pmid` int(11) NOT NULL, - `personIdentifier` varchar(128) NOT NULL, - `firstRetrievalDate` datetime DEFAULT NULL, - `retrievalStrategy` varchar(64) DEFAULT NULL, - `source` varchar(32) DEFAULT NULL, - PRIMARY KEY (`pmid`, `personIdentifier`), - KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; - --- ----------------------------------------------------------------------------- --- Verification --- ----------------------------------------------------------------------------- - -SELECT table_name, column_name, data_type, character_maximum_length, is_nullable, column_key -FROM information_schema.columns -WHERE table_schema = DATABASE() - AND table_name = 'article_provenance' -ORDER BY ordinal_position; diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql index d5127fe..91c8641 100644 --- a/setup/createDatabaseTableReciterDb.sql +++ b/setup/createDatabaseTableReciterDb.sql @@ -492,23 +492,6 @@ CREATE TABLE IF NOT EXISTS `analysis_temp_output_table_cell` ( KEY `personIdentifier` (`personIdentifier`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; --- article_provenance: first-retrieval provenance per (article, person), loaded --- nightly by update/retrieveArticleProvenance.py from the ReCiter DynamoDB --- `ArticleProvenance` table (composite key uid + articleId). Keyed on --- (pmid, personIdentifier) to mirror that source key exactly -- one row per --- person+article. firstRetrievalDate is `frd` (epoch seconds, UTC) converted to --- DATETIME. Loaded via staging->atomic-swap; NOT in any truncate list. --- Consumed by Publication Manager #737 ("date a publication was first retrieved"). 
-CREATE TABLE IF NOT EXISTS `article_provenance` ( - `pmid` int(11) NOT NULL, - `personIdentifier` varchar(128) NOT NULL, - `firstRetrievalDate` datetime DEFAULT NULL, - `retrievalStrategy` varchar(64) DEFAULT NULL, - `source` varchar(32) DEFAULT NULL, - PRIMARY KEY (`pmid`, `personIdentifier`), - KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; - -- ============================================================================ -- Journal Tables -- ============================================================================ diff --git a/update/retrieveArticleProvenance.py b/update/retrieveArticleProvenance.py deleted file mode 100644 index ec06254..0000000 --- a/update/retrieveArticleProvenance.py +++ /dev/null @@ -1,353 +0,0 @@ -# retrieveArticleProvenance.py -# -# Nightly ETL step that scans the ReCiter DynamoDB `ArticleProvenance` table and -# loads it into the reciterdb `article_provenance` table. Powers the Publication -# Manager "date a publication was first retrieved" display in /curate -# (wcmc-its/ReCiter-Publication-Manager#737); backend half of ReCiterDB#95. -# -# Source table (reciter.service.dynamo.ArticleProvenanceServiceImpl): -# - COMPOSITE key: `uid` (HASH, the personIdentifier/CWID) + `articleId` (RANGE, -# the PMID as a String). One item per (person, article) pair. -# - `frd` = first retrieval date, epoch SECONDS, written with if_not_exists so it -# is immutable once set (the first time that person retrieved that article). -# - `rs` = first retrieval strategy (PM_UI_SEARCH, PM_AUTHOR, ...). -# - `src` = source (PM, CTSC, GS, MAN, MAN_FROM_PM, ...). -# - `ads` = String Set of all strategies seen (not loaded here). -# -# reciterdb target is keyed on (pmid, personIdentifier) -- it mirrors the DynamoDB -# composite key exactly (one row per person+article), so no cross-person collapse -# is performed. 
frd (epoch seconds, UTC) is converted to a DATETIME on load to -# match the rest of reciterdb; PM formats it for display. -# -# Memory: rows are streamed into the staging table one scan page at a time -# (INSERT IGNORE), so peak RSS is bounded to one page regardless of corpus size. -# INSERT IGNORE also collapses any (pmid, personIdentifier) collision (e.g. a -# case-variant uid under the utf8mb4_unicode_ci PK) rather than aborting the load. -# -# Atomicity: mirrors retrieveNIH.py -- load into a `article_provenance_new` -# staging table, validate it against production, then RENAME-swap. A failure here -# leaves production untouched. run_all.py runs this step as NON-FATAL so a hiccup -# does not block the nightly indexing SP (PM reads this table directly; nothing -# downstream depends on it). - -import os -import sys -import time -import random -import logging -import faulthandler -import signal -from datetime import datetime, timezone - -import boto3 -from botocore.config import Config -from botocore.exceptions import BotoCoreError, ClientError -import pymysql.cursors -import pymysql.err - -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('retrieveArticleProvenance.log', mode='w'), - logging.StreamHandler(sys.stdout), - ], -) -logger = logging.getLogger(__name__) - -faulthandler.enable(file=sys.stderr, all_threads=True) -faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True) - -DYNAMO_TABLE = 'ArticleProvenance' -TARGET_TABLE = 'article_provenance' -STAGING_TABLE = 'article_provenance_new' -BACKUP_TABLE = 'article_provenance_backup' -SCAN_PAGE_SIZE = 1000 - -# Validation floor: reject a partial/empty scan that would otherwise seed or -# replace production with too little data (matches retrieveNIH's min_rows). -MIN_STAGING_ROWS = 100 -# Warn if more than this fraction of scanned items are skipped (data-quality signal). 
-SKIP_RATIO_WARN = 0.10 - -# Plausible bounds for frd (epoch seconds). Anything outside is treated as corrupt -# and stored NULL rather than a bogus DATETIME. Lower bound = 2000-01-01 UTC. -MIN_EPOCH_SECONDS = 946684800 - -# DDL kept in sync with setup/createDatabaseTableReciterDb.sql and -# setup/alter_add_article_provenance_v1.6.sql. Created defensively so a fresh -# environment that has not yet run the migration still works. -CREATE_TARGET_SQL = f""" -CREATE TABLE IF NOT EXISTS `{TARGET_TABLE}` ( - `pmid` int(11) NOT NULL, - `personIdentifier` varchar(128) NOT NULL, - `firstRetrievalDate` datetime DEFAULT NULL, - `retrievalStrategy` varchar(64) DEFAULT NULL, - `source` varchar(32) DEFAULT NULL, - PRIMARY KEY (`pmid`, `personIdentifier`), - KEY `idx_personIdentifier` (`personIdentifier`) USING BTREE -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci -""" - -LOAD_COLUMNS = ['pmid', 'personIdentifier', 'firstRetrievalDate', - 'retrievalStrategy', 'source'] - - -def connect_db(max_retries=5, backoff_factor=1): - username = os.environ['DB_USERNAME'] - password = os.environ['DB_PASSWORD'] - hostname = os.environ['DB_HOST'] - database = os.environ['DB_NAME'] - for retry in range(max_retries): - try: - conn = pymysql.connect( - user=username, - password=password, - database=database, - host=hostname, - charset='utf8mb4', - cursorclass=pymysql.cursors.DictCursor, - ) - logger.info('Connected to database %s on %s', database, hostname) - return conn - except pymysql.err.MySQLError as err: - logger.error('DB connect attempt %d failed: %s', retry + 1, err) - time.sleep(backoff_factor * (2 ** retry) + random.uniform(0, 1)) - raise RuntimeError('Could not connect to database after retries') - - -def epoch_to_datetime_str(frd): - """Convert an epoch-seconds value (DynamoDB Decimal/int/str) to a UTC - 'YYYY-MM-DD HH:MM:SS' string, or None if absent/invalid/out-of-range. 
frd is - stored as UTC; Publication Manager formats it for display.""" - if frd is None: - return None - try: - secs = int(frd) - except (TypeError, ValueError): - logger.warning('Unparseable frd value: %r; storing NULL', frd) - return None - # Reject implausible timestamps (corrupt data) rather than store a bogus year. - upper = int(datetime.now(tz=timezone.utc).timestamp()) + 86400 # now + 1 day skew - if secs < MIN_EPOCH_SECONDS or secs > upper: - logger.warning('Out-of-range frd value: %r; storing NULL', frd) - return None - try: - return datetime.fromtimestamp(secs, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S') - except (OverflowError, OSError, ValueError): - logger.warning('Unconvertible frd value: %r; storing NULL', frd) - return None - - -def scan_article_provenance(dynamo_table): - """Generator that yields pages (lists of items) from a full scan of the - ArticleProvenance table. Eventually-consistent read is fine for a nightly - snapshot (frd is immutable once written).""" - total = 0 - last_key = None - while True: - kwargs = {'Limit': SCAN_PAGE_SIZE} - if last_key: - kwargs['ExclusiveStartKey'] = last_key - response = dynamo_table.scan(**kwargs) - items = response.get('Items', []) - total += len(items) - if items: - logger.info('Scanned %d items from %s (running total: %d).', - len(items), DYNAMO_TABLE, total) - yield items - last_key = response.get('LastEvaluatedKey') - if not last_key: - break - logger.info('Finished scanning %s. 
Total items: %d.', DYNAMO_TABLE, total) - - -def item_to_row(item): - """Map one ArticleProvenance item to a row tuple matching LOAD_COLUMNS, or - return None to skip (missing uid / non-numeric articleId).""" - uid = item.get('uid') - if not uid: - return None - try: - pmid = int(item.get('articleId')) - except (TypeError, ValueError): - return False # distinguishes bad-pmid from no-uid for counting - first_retrieval = epoch_to_datetime_str(item.get('frd')) - rs = item.get('rs') - src = item.get('src') - rs = str(rs)[:64] if rs is not None else None - src = str(src)[:32] if src is not None else None - return (pmid, str(uid)[:128], first_retrieval, rs, src) - - -def stream_into_staging(conn, cursor, dynamo_table): - """Scan ArticleProvenance and INSERT IGNORE each page into the staging table. - Streaming keeps peak memory to one page; INSERT IGNORE collapses any - (pmid, personIdentifier) duplicate (incl. case-variant uids under the - case-insensitive PK) instead of aborting. Returns scan/skip stats.""" - col_list = ', '.join(f'`{c}`' for c in LOAD_COLUMNS) - placeholders = ', '.join(['%s'] * len(LOAD_COLUMNS)) - sql = f"INSERT IGNORE INTO `{STAGING_TABLE}` ({col_list}) VALUES ({placeholders})" - - scanned = skipped_no_uid = skipped_bad_pmid = 0 - for page in scan_article_provenance(dynamo_table): - scanned += len(page) - page_rows = [] - for item in page: - row = item_to_row(item) - if row is None: - skipped_no_uid += 1 - elif row is False: - skipped_bad_pmid += 1 - else: - page_rows.append(row) - if page_rows: - cursor.executemany(sql, page_rows) - conn.commit() - - cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`") - staged = cursor.fetchone()['c'] - skipped = skipped_no_uid + skipped_bad_pmid - logger.info('Scanned %d items; staged %d rows (skipped %d no-uid, %d bad-pmid).', - scanned, staged, skipped_no_uid, skipped_bad_pmid) - if scanned and (skipped / scanned) > SKIP_RATIO_WARN: - logger.warning('High skip ratio: %d/%d (%.1f%%) of scanned items 
were ' - 'dropped; staged table may be partial.', - skipped, scanned, 100.0 * skipped / scanned) - return {'scanned': scanned, 'staged': staged} - - -def create_staging_table(cursor): - """(Re)create the staging table as an empty clone of production.""" - cursor.execute(CREATE_TARGET_SQL) # ensure production exists (fresh env) - cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`") - cursor.execute(f"CREATE TABLE `{STAGING_TABLE}` LIKE `{TARGET_TABLE}`") - logger.info('Created staging table %s', STAGING_TABLE) - - -def recover_orphaned_backup(conn, cursor): - """Self-heal from a prior run that died after RENAMEing production away but - before the swap completed: if production is gone but a backup exists, restore - it so we never fabricate an empty production table over a good backup.""" - cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'") - if cursor.fetchone(): - return - cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'") - if cursor.fetchone(): - logger.warning('Production %s missing but %s present (orphaned prior run); ' - 'restoring backup before proceeding.', TARGET_TABLE, BACKUP_TABLE) - cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`") - conn.commit() - - -def validate_staging(cursor, min_rows=MIN_STAGING_ROWS, min_percentage=80): - """Guard against replacing a healthy production table with a partial/empty - scan. 
Requires the staging table to meet a row floor and (when production - already has data) to be at least min_percentage of the production row count.""" - cursor.execute(f"SELECT COUNT(*) AS c FROM `{STAGING_TABLE}`") - staging_count = cursor.fetchone()['c'] - cursor.execute(f"SELECT COUNT(*) AS c FROM `{TARGET_TABLE}`") - production_count = cursor.fetchone()['c'] - logger.info('Validation: %s has %d rows, %s has %d rows', - STAGING_TABLE, staging_count, TARGET_TABLE, production_count) - - if staging_count < min_rows: - logger.error('Validation FAILED: %s has %d rows (minimum %d)', - STAGING_TABLE, staging_count, min_rows) - return False - if production_count > 0: - percentage = (staging_count / production_count) * 100 - logger.info('Staging is %.1f%% of production', percentage) - if percentage < min_percentage: - logger.error('Validation FAILED: staging (%d) is only %.1f%% of ' - 'production (%d); minimum %d%%', - staging_count, percentage, production_count, min_percentage) - return False - logger.info('Validation PASSED for %s', STAGING_TABLE) - return True - - -def atomic_swap(conn, cursor): - """Atomically swap staging into production: production -> backup, staging -> - production, in a single RENAME TABLE (atomic in MariaDB/InnoDB).""" - cursor.execute(f"DROP TABLE IF EXISTS `{BACKUP_TABLE}`") - rename_sql = (f"RENAME TABLE `{TARGET_TABLE}` TO `{BACKUP_TABLE}`, " - f"`{STAGING_TABLE}` TO `{TARGET_TABLE}`") - logger.info('Executing atomic swap: %s', rename_sql) - cursor.execute(rename_sql) - conn.commit() - logger.info('Atomic swap completed for %s', TARGET_TABLE) - - -def restore_from_backup(conn, cursor): - """Rename the backup table back to production if a swap failed mid-flight. 
- Because the swap is a single atomic RENAME, a raised swap means NEITHER table - was renamed and the backup was already dropped -- so 'no backup found' here is - the expected, SAFE outcome (production was never moved), not a data-loss event.""" - cursor.execute(f"SHOW TABLES LIKE '{BACKUP_TABLE}'") - if not cursor.fetchone(): - logger.info('No backup table %s to restore (production left untouched).', - BACKUP_TABLE) - return - cursor.execute(f"SHOW TABLES LIKE '{TARGET_TABLE}'") - if cursor.fetchone(): - cursor.execute(f"DROP TABLE IF EXISTS `{TARGET_TABLE}`") - cursor.execute(f"RENAME TABLE `{BACKUP_TABLE}` TO `{TARGET_TABLE}`") - conn.commit() - logger.info('Restored %s from backup', TARGET_TABLE) - - -def cleanup_staging(conn, cursor): - cursor.execute(f"DROP TABLE IF EXISTS `{STAGING_TABLE}`") - conn.commit() - logger.info('Cleaned up staging table %s', STAGING_TABLE) - - -def main(): - conn = connect_db() - cursor = conn.cursor() - - cfg = Config(retries={'max_attempts': 10, 'mode': 'standard'}) - dynamo_table = boto3.resource('dynamodb', config=cfg).Table(DYNAMO_TABLE) - - try: - recover_orphaned_backup(conn, cursor) - create_staging_table(cursor) - conn.commit() - - stream_into_staging(conn, cursor, dynamo_table) - - if not validate_staging(cursor): - logger.error('Validation failed; aborting swap to protect production.') - cleanup_staging(conn, cursor) - sys.exit(1) - - try: - atomic_swap(conn, cursor) - except Exception as swap_err: - logger.error('Atomic swap failed: %s; attempting restore.', swap_err) - restore_from_backup(conn, cursor) - cleanup_staging(conn, cursor) - sys.exit(1) - - logger.info('SUCCESS: %s updated with zero downtime.', TARGET_TABLE) - - except (BotoCoreError, ClientError) as e: - logger.error('DynamoDB error during %s scan: %s', DYNAMO_TABLE, e) - cleanup_staging(conn, cursor) - sys.exit(1) - except pymysql.err.MySQLError as e: - logger.error('Database error: %s', e) - try: - cleanup_staging(conn, cursor) - except Exception: - pass 
- sys.exit(1) - finally: - cursor.close() - conn.close() - logger.info('Database connection closed.') - - -if __name__ == '__main__': - main() diff --git a/update/run_all.py b/update/run_all.py index 61713a1..6ebe245 100644 --- a/update/run_all.py +++ b/update/run_all.py @@ -109,30 +109,22 @@ def upload_log_to_s3(): # ------------- Main Flow ------------- def main(): - # (name, command, non_fatal). non_fatal=True steps log a warning and continue - # on failure instead of aborting the pipeline. scripts = [ - ("executeFeatureGenerator", "python3 executeFeatureGenerator.py", False), - ("retrieveArticles", "python3 retrieveArticles.py", False), - ("retrieveNIH", "python3 retrieveNIH.py", False), - ("retrieveReporter", "python3 retrieveReporter.py", False), - # article_provenance feeds a PM display field only; nothing downstream - # depends on it, so a failure here must not block nightly indexing. - ("retrieveArticleProvenance", "python3 retrieveArticleProvenance.py", True), - ("nightlyIndexing", "bash run_nightly_indexing.sh", False), - ("abstractImport", "python3 abstractImport.py", False), - ("conflictsImport", "python3 conflictsImport.py", False) + ("executeFeatureGenerator", "python3 executeFeatureGenerator.py"), + ("retrieveArticles", "python3 retrieveArticles.py"), + ("retrieveNIH", "python3 retrieveNIH.py"), + ("retrieveReporter", "python3 retrieveReporter.py"), + ("nightlyIndexing", "bash run_nightly_indexing.sh"), + ("abstractImport", "python3 abstractImport.py"), + ("conflictsImport", "python3 conflictsImport.py") ] overall_success = True - for name, cmd, non_fatal in scripts: + for name, cmd in scripts: #ok = run_script(name, cmd) ok = run_script(name, cmd, timeout_seconds=int(os.getenv("SCRIPT_TIMEOUT_SECONDS", "15000"))) if not ok: - if non_fatal: - logger.warning(f"⚠️ NON-FATAL: {name} failed; continuing pipeline.") - continue overall_success = False logger.error("Stopping pipeline due to script failure.") break From 
408c4074b1bd188a720112d779652ffe2bb72cfd Mon Sep 17 00:00:00 2001 From: Paul Albert Date: Wed, 17 Jun 2026 15:58:39 -0400 Subject: [PATCH 19/19] feat(setup): mirror RBAC permission tables + impersonatedByUserID column 3-places mirror of ReCiter-Publication-Manager schema: - add admin_permissions, admin_role_permissions, admin_permission_resources to createDatabaseTableReciterDb.sql - add impersonatedByUserID column to admin_feedback_log - add table_admin_permissions.sql with the permission/role-mapping/nav seed Mirrors PM scripts/migrations/add-permission-tables.sql and add-impersonated-by-feedbacklog.sql. Refs #739, #733. --- setup/createDatabaseTableReciterDb.sql | 44 ++++++++++++++++++++ setup/table_admin_permissions.sql | 56 ++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 setup/table_admin_permissions.sql diff --git a/setup/createDatabaseTableReciterDb.sql b/setup/createDatabaseTableReciterDb.sql index 91c8641..ea3bd8e 100644 --- a/setup/createDatabaseTableReciterDb.sql +++ b/setup/createDatabaseTableReciterDb.sql @@ -48,6 +48,7 @@ CREATE TABLE IF NOT EXISTS `admin_feedback_log` ( `personIdentifier` varchar(20) DEFAULT NULL, `articleIdentifier` int(11) DEFAULT NULL, `feedback` varchar(11) DEFAULT NULL, + `impersonatedByUserID` int(11) DEFAULT NULL, `createTimestamp` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', `modifyTimestamp` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), PRIMARY KEY (`feedbackID`), @@ -96,6 +97,49 @@ CREATE TABLE IF NOT EXISTS `admin_roles` ( PRIMARY KEY (`roleID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +-- Data-driven RBAC permission tables. +-- Mirror of ReCiter-Publication-Manager scripts/migrations/add-permission-tables.sql +-- (3-places rule). Seed data lives in setup/table_admin_permissions.sql. 
+CREATE TABLE IF NOT EXISTS `admin_permissions` ( + `permissionID` int(11) NOT NULL AUTO_INCREMENT, + `permissionKey` varchar(128) NOT NULL, + `label` varchar(255) NOT NULL, + `description` text DEFAULT NULL, + `category` varchar(64) NOT NULL, + `createTimestamp` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00', + `modifyTimestamp` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(), + PRIMARY KEY (`permissionID`), + UNIQUE KEY `uq_permissionKey` (`permissionKey`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +CREATE TABLE IF NOT EXISTS `admin_role_permissions` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `roleID` int(11) NOT NULL, + `permissionID` int(11) NOT NULL, + `createTimestamp` timestamp NOT NULL DEFAULT current_timestamp(), + PRIMARY KEY (`id`), + UNIQUE KEY `uq_role_permission` (`roleID`,`permissionID`), + KEY `idx_roleID` (`roleID`), + KEY `idx_permissionID` (`permissionID`), + CONSTRAINT `fk_rp_role` FOREIGN KEY (`roleID`) REFERENCES `admin_roles` (`roleID`) ON DELETE CASCADE, + CONSTRAINT `fk_rp_permission` FOREIGN KEY (`permissionID`) REFERENCES `admin_permissions` (`permissionID`) ON DELETE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + +CREATE TABLE IF NOT EXISTS `admin_permission_resources` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `permissionID` int(11) NOT NULL, + `resourceType` varchar(32) NOT NULL, + `resourceKey` varchar(128) NOT NULL, + `displayOrder` int(11) NOT NULL DEFAULT 0, + `icon` varchar(64) DEFAULT NULL, + `label` varchar(255) NOT NULL, + `route` varchar(255) DEFAULT NULL, + `createTimestamp` timestamp NOT NULL DEFAULT current_timestamp(), + PRIMARY KEY (`id`), + KEY `idx_pr_permissionID` (`permissionID`), + CONSTRAINT `fk_pr_permission` FOREIGN KEY (`permissionID`) REFERENCES `admin_permissions` (`permissionID`) ON DELETE CASCADE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + CREATE TABLE IF NOT EXISTS `admin_settings` ( 
`viewName` varchar(200) NOT NULL, `viewAttributes` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_bin DEFAULT NULL CHECK (json_valid(`viewAttributes`)), diff --git a/setup/table_admin_permissions.sql b/setup/table_admin_permissions.sql new file mode 100644 index 0000000..c1d37af --- /dev/null +++ b/setup/table_admin_permissions.sql @@ -0,0 +1,56 @@ +-- Seed data for the data-driven RBAC permission tables. +-- Mirror of the SEED section of ReCiter-Publication-Manager +-- scripts/migrations/add-permission-tables.sql (3-places rule). +-- +-- Run ONCE per environment, after createDatabaseTableReciterDb.sql has created +-- the tables and table_admin_roles.sql has seeded admin_roles. The role->permission +-- seed joins on admin_roles.roleLabel, so it adapts to whatever roles an +-- environment defines; 'Curator_Scoped' is a harmless no-op where that role +-- does not exist. + +-- 1. Permissions (7) +INSERT INTO `admin_permissions` (`permissionKey`, `label`, `description`, `category`) VALUES + ('canCurate', 'Curate Publications', 'Accept or reject article suggestions for people', 'Curation'), + ('canSearch', 'Search Identities', 'Search and browse the identity directory', 'Navigation'), + ('canReport', 'Create Reports', 'Generate publication reports and export data', 'Reporting'), + ('canManageUsers', 'Manage Users', 'Create, edit, and deactivate user accounts and assign roles', 'Administration'), + ('canConfigure', 'Configuration', 'Edit application settings, labels, and field visibility', 'Administration'), + ('canManageNotifications', 'Manage Notifications', 'Configure notification preferences', 'Communication'), + ('canManageProfile', 'Manage Profile', 'View and edit user profile information', 'Profile'); + +-- 2. 
Role -> permission mappings (reproduces current behavior) +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Superuser'; -- all 7 +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_All' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_Self' AND ap.permissionKey IN ('canCurate'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_Scoped' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_Department' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Curator_Department_Delegate' AND ap.permissionKey IN ('canCurate','canSearch'); +INSERT INTO `admin_role_permissions` (`roleID`, `permissionID`) + SELECT ar.roleID, ap.permissionID FROM admin_roles ar CROSS JOIN admin_permissions ap + WHERE ar.roleLabel = 'Reporter_All' AND ap.permissionKey IN ('canReport','canSearch'); + +-- 3. 
Nav resources (sidebar items) +INSERT INTO `admin_permission_resources` (`permissionID`, `resourceType`, `resourceKey`, `displayOrder`, `icon`, `label`, `route`) + SELECT ap.permissionID, v.resourceType, v.resourceKey, v.displayOrder, v.icon, v.label, v.route + FROM admin_permissions ap + JOIN ( + SELECT 'canSearch' AS pk, 'nav' AS resourceType, 'nav_search' AS resourceKey, 1 AS displayOrder, 'Search' AS icon, 'Find People' AS label, '/search' AS route + UNION ALL SELECT 'canCurate', 'nav', 'nav_curate', 2, 'LocalLibrary', 'Curate Publications', '/curate' + UNION ALL SELECT 'canReport', 'nav', 'nav_report', 3, 'Assessment', 'Create Reports', '/report' + UNION ALL SELECT 'canManageNotifications', 'nav', 'nav_notifications', 4, 'NotificationsActive', 'Manage Notifications', '/notifications' + UNION ALL SELECT 'canManageProfile', 'nav', 'nav_profile', 5, 'AccountCircle', 'Manage Profile', '/manageprofile' + UNION ALL SELECT 'canManageUsers', 'nav', 'nav_users', 6, 'Group', 'Manage Users', '/manageusers' + UNION ALL SELECT 'canConfigure', 'nav', 'nav_config', 7, 'Settings', 'Configuration', '/configuration' + ) v ON ap.permissionKey = v.pk;