Skip to content
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
78 commits
Select commit Hold shift + click to select a range
7bfa850
Script for query
rdurnik Jan 27, 2026
ca87cb4
Script for abstracts download
rdurnik Jan 27, 2026
1bed026
Query clean up
rdurnik Jan 27, 2026
50f8ced
Script for chemical identification
rdurnik Jan 27, 2026
36ed78e
Script for matching of chemical names to database of relevant chemicals
rdurnik Jan 27, 2026
e5fcdc9
Lint
rdurnik Jan 27, 2026
95044b4
Script to normalize chemical
rdurnik Jan 27, 2026
5759105
Updated matching to take list of chemical names (Excel file)
rdurnik Jan 27, 2026
409c8c9
Script to generate MeSH terms dataframe
rdurnik Jan 27, 2026
f3258e2
Script for find relationships
rdurnik Jan 27, 2026
dffc889
IDs are saved as txt file
rdurnik Jan 27, 2026
4aa629b
Clean up
rdurnik Jan 27, 2026
9204380
Added output dir
rdurnik Jan 28, 2026
12849a8
Abstracts are saved as tsv file
rdurnik Jan 28, 2026
9803780
Script for PDF download
rdurnik Jan 28, 2026
0f55a39
MeSH terms df saved as tsv
rdurnik Jan 28, 2026
2c5ff4a
Lint
rdurnik Jan 28, 2026
e691d6f
Chemical identification saved as file
rdurnik Jan 28, 2026
95c1524
Normalize chemical takes saved output and outputs a file
rdurnik Jan 28, 2026
2e91525
Chemical matching takes file and returns a file
rdurnik Jan 28, 2026
36239be
Find chemicals keeps text
rdurnik Jan 28, 2026
0c93372
Script to parse PDFs
rdurnik Jan 29, 2026
fd189fe
Renamed file
rdurnik Jan 29, 2026
56c0604
Abstract download fixed for Europe PMC
rdurnik Jan 29, 2026
19d6d32
Renamed column names
rdurnik Jan 29, 2026
ace2fe8
Script to find relationships
rdurnik Jan 29, 2026
7e331f2
Initial version chemical identification wrappers
rdurnik Feb 10, 2026
0bfc9c7
Made find chemicals into xml file
rdurnik Feb 25, 2026
b6f33b5
Updated version
rdurnik Feb 25, 2026
75acb8f
Replaced tsv test file with txt
rdurnik Feb 25, 2026
b401fd7
Changed extension
rdurnik Feb 25, 2026
c893ba7
Changed download abstracts into xml
rdurnik Feb 25, 2026
1adeee5
Fixed download abstracts name
rdurnik Mar 5, 2026
2ea1ee4
Fixed description
rdurnik Mar 5, 2026
d72834d
Download abstracts fixes
rdurnik Mar 5, 2026
12f21a3
Download PDF as XML
rdurnik Mar 5, 2026
a63a4a8
Removed print
rdurnik Mar 5, 2026
219fa0c
Updated text data set
rdurnik Mar 5, 2026
722fead
IDs test data set
rdurnik Mar 5, 2026
6e58b46
Find relationships XML
rdurnik Mar 5, 2026
5e09fd9
Removed MeSH term generation
rdurnik Mar 5, 2026
972f9b0
PDF parsing XML
rdurnik Mar 5, 2026
589c8ad
Query literature XML
rdurnik Mar 5, 2026
96c7a70
Changed chemical identification to output one chemical per line
rdurnik Mar 5, 2026
fb0df48
Chemical normalization
rdurnik Mar 5, 2026
67cebd2
Matching chemicals
rdurnik Mar 5, 2026
9d05d10
Description fix
rdurnik Mar 6, 2026
9320772
Needs to be defined before the other imports
rdurnik Mar 6, 2026
d1c3642
Fixed macros
rdurnik Mar 6, 2026
6961497
Renamed normalization to normalization PubChem
rdurnik Mar 6, 2026
43447bd
Changed description
rdurnik Mar 6, 2026
8a8114d
Merge branch 'master' into new_wrappers
hechth Mar 24, 2026
832eb76
removed imports
rdurnik Mar 25, 2026
89d1f73
added pmc
rdurnik Mar 25, 2026
2d2c0df
updated database selection
rdurnik Apr 9, 2026
13a7ee9
updated to get pdf
rdurnik Apr 9, 2026
20c5436
updated database selection
rdurnik Apr 9, 2026
42482c7
added openai key macro
rdurnik Apr 9, 2026
57d83a5
updated to openai_api_key_credentials
rdurnik Apr 9, 2026
fd700d2
updated name
rdurnik Apr 9, 2026
0f37098
wrapper to get full text
rdurnik Apr 9, 2026
01190f0
Name update
rdurnik Apr 10, 2026
8322fcb
added potential llm api key to pdf parsing
rdurnik Apr 10, 2026
ec59595
updated macros for llms
rdurnik Apr 10, 2026
f8c5550
find relationships with llms
rdurnik Apr 10, 2026
50156b6
updated pdf desc
rdurnik Apr 10, 2026
6891035
changed desc to text
rdurnik Apr 20, 2026
0deaeba
llm find chemicals
rdurnik Apr 20, 2026
87aefb8
new test file
rdurnik Apr 20, 2026
54b7e5c
normalization using llms
rdurnik Apr 20, 2026
30c8ba6
clean up, tests fix
rdurnik Apr 20, 2026
8d8e4c0
name change
rdurnik Apr 20, 2026
71e57a8
pdf test data
rdurnik Apr 21, 2026
e89e76f
requirement not supposed to be in inputs
rdurnik Apr 21, 2026
9cde057
pdf input file name change
rdurnik Apr 21, 2026
86dfc8f
file name rename
rdurnik Apr 21, 2026
4b7a32a
test fixes
rdurnik Apr 21, 2026
f07680a
added mesh terms normalization
rdurnik Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions tools/aoptk/aoptk_chemical_identification.py
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be rewritten as a direct Galaxy tool wrapper using the following syntax:

with open($input_file, "r") as f_in, open($output_file, "w") as f_out:
    f_out.write("id\ttext\tchemicals\n")
    for row in csv.DictReader(f_in, delimiter="\t"):
        chemicals = Spacy().find_chemical(row["text"])
        chemicals_str = (
            "|".join(set([chem.name for chem in chemicals])) if chemicals else ""
        )
        f_out.write(f"{row['id']}\t{row['text']}\t{chemicals_str}\n")

Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from aoptk.chemical import Chemical
from aoptk.spacy_processor import Spacy
import argparse
import csv
import os


def find_chemicals(text: str) -> list[Chemical]:
    """Identify the chemicals mentioned in a piece of text.

    Args:
        text (str): Text to search for chemical mentions.

    Returns:
        list[Chemical]: The result of ``Spacy().find_chemical(text)``.
    """
    recognizer = Spacy()
    found = recognizer.find_chemical(text)
    return found


def save_file(input_file: str, output_file: str) -> None:
    """Annotate every row of a TSV file with the chemicals found in its text.

    The input is read with ``csv.DictReader`` and the output is written with
    ``csv.writer`` so that text containing tabs, newlines or quotes is quoted
    on the way out and the file stays a valid tab-separated table.

    Args:
        input_file (str): Path to input TSV file with 'id' and 'text' columns.
        output_file (str): Path to output TSV file with the columns
            'id', 'text' and 'chemicals' ('|'-separated chemical names).
    """
    # newline="" hands line-ending handling to the csv module, which is needed
    # for quoted fields that contain embedded newlines.
    with open(input_file, "r", newline="") as f_in, open(
        output_file, "w", newline=""
    ) as f_out:
        writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
        writer.writerow(["id", "text", "chemicals"])
        for row in csv.DictReader(f_in, delimiter="\t"):
            chemicals = find_chemicals(row["text"])
            # Deduplicate, then sort so repeated runs produce identical output
            # (plain set iteration order is not stable between runs).
            chemicals_str = (
                "|".join(sorted({chem.name for chem in chemicals}))
                if chemicals
                else ""
            )
            writer.writerow([row["id"], row["text"], chemicals_str])


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the chemical identification script.

    Returns:
        argparse.Namespace: Namespace with ``input_file`` and ``outdir``.
    """
    cli = argparse.ArgumentParser(
        description="Identify chemicals in a TSV file with text column"
    )
    # Both options are mandatory; declare them from one table.
    required_options = (
        ("--input_file", "Input TSV file with text column"),
        ("--outdir", "Output directory for saving results"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: read the CLI options and write
    # <outdir>/chemicals.tsv for the given input file.
    cli_args = parse_args()
    save_file(
        input_file=cli_args.input_file,
        output_file=os.path.join(cli_args.outdir, "chemicals.tsv"),
    )
92 changes: 92 additions & 0 deletions tools/aoptk/aoptk_chemical_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from aoptk.chemical import Chemical
import argparse
import pandas as pd


def extract_chemicals_to_match(input_file: str) -> list[Chemical]:
    """Load the chemicals to match from a tab-separated file.

    Args:
        input_file (str): Path to a TSV file with the columns 'name',
            'heading' and 'synonyms' (synonyms separated by ';').

    Returns:
        list[Chemical]: One Chemical per row, with ``heading`` and
        ``_synonyms`` filled in from the row.
    """
    loaded = []
    with open(input_file, "r") as handle:
        table = pd.read_csv(handle, sep="\t")
        for record in table.itertuples():
            entry = Chemical(record.name)
            entry.heading = record.heading
            # An empty synonyms cell is read as NaN; treat it as "no synonyms".
            if pd.notna(record.synonyms):
                entry._synonyms = set(record.synonyms.split(";"))
            else:
                entry._synonyms = set()
            loaded.append(entry)
    return loaded


def match_chemicals_with_loose_equality(
    list_of_relevant_chemicals: list[Chemical],
    chemicals: list[Chemical],
) -> list[str]:
    """Return the names of the chemicals that resemble a relevant chemical.

    Args:
        list_of_relevant_chemicals (list[Chemical]): Reference chemicals.
        chemicals (list[Chemical]): Chemicals to test against the references.

    Returns:
        list[str]: Name of every entry of ``chemicals`` for which
        ``similar()`` is true for at least one reference chemical, in input
        order. Each chemical contributes its name at most once.
    """
    # any() stops at the first similar reference, like the explicit break.
    return [
        candidate.name
        for candidate in chemicals
        if any(
            candidate.similar(reference)
            for reference in list_of_relevant_chemicals
        )
    ]


def generate_relevant_chemicals(chemical_database: str) -> list[Chemical]:
    """Generate a list of relevant chemicals from an Excel file.

    Names are taken from the 'chemical_name' column, stripped of surrounding
    whitespace, lower-cased and deduplicated. Blank cells are ignored.

    Args:
        chemical_database (str): Path to the user-defined chemical database
            in Excel. It must contain a 'chemical_name' column.

    Returns:
        list[Chemical]: One Chemical per distinct normalized name.
    """
    relevant_chemicals_database = pd.read_excel(chemical_database)
    names = (
        relevant_chemicals_database["chemical_name"]
        # Drop blank cells first: astype(str) would otherwise turn NaN into
        # the literal string "nan" and create a bogus chemical.
        .dropna()
        .astype(str)
        .str.strip()
        .str.lower()
    )
    # Cells that held only whitespace are empty after stripping.
    names = names[names != ""]
    return [Chemical(name) for name in names.unique()]


def save_file(relevant_chemicals_names: list[str], output_file: str) -> None:
    """Write the matched chemical names to a single-column TSV file.

    Args:
        relevant_chemicals_names (list[str]): Names to write, one per line.
        output_file (str): Path to the output file. The first line is the
            header 'matched_chemicals'.
    """
    with open(output_file, "w") as handle:
        handle.write("matched_chemicals\n")
        handle.writelines(f"{name}\n" for name in relevant_chemicals_names)


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the chemical matching script.

    Returns:
        argparse.Namespace: Namespace with ``list_of_relevant_chemicals``,
        ``normalized_chemicals`` and ``outdir``.
    """
    cli = argparse.ArgumentParser(description="Match chemicals using loose equality")
    # All three options are mandatory; declare them from one table.
    required_options = (
        ("--list_of_relevant_chemicals", "List of relevant chemicals"),
        ("--normalized_chemicals", "List of normalized chemicals"),
        ("--outdir", "Output directory for saving files"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: load both chemical lists, match them and write
    # the matching names to <outdir>/matched_chemicals.tsv.
    cli_args = parse_args()
    candidates = extract_chemicals_to_match(cli_args.normalized_chemicals)
    references = generate_relevant_chemicals(cli_args.list_of_relevant_chemicals)
    matched_names = match_chemicals_with_loose_equality(references, candidates)
    destination = f"{cli_args.outdir}/matched_chemicals.tsv"
    save_file(matched_names, output_file=destination)
54 changes: 54 additions & 0 deletions tools/aoptk/aoptk_chemical_normalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from aoptk.chemical import Chemical
from aoptk.normalization.mesh_terms import MeshTerms
import argparse
import pandas as pd
import os


def normalize_chemical(mesh_terms: pd.DataFrame, chemical: Chemical) -> Chemical:
    """Normalize one chemical against a MeSH terms table.

    Args:
        mesh_terms (pd.DataFrame): MeSH terms dataframe handed to ``MeshTerms``.
        chemical (Chemical): Chemical to normalize.

    Returns:
        Chemical: The result of ``MeshTerms(mesh_terms).normalize_chemical``.
    """
    normalizer = MeshTerms(mesh_terms)
    return normalizer.normalize_chemical(chemical)


def save_file(input_file: str, mesh_terms_df: pd.DataFrame, output_file: str) -> None:
    """Process a TSV file with chemicals, normalize them, and save results.

    Each input row carries a '|'-separated list of chemical names in its
    'chemicals' column. Every non-empty name is normalized and written as one
    output row. Rows without chemicals are skipped.

    Args:
        input_file (str): Path to input TSV file with a 'chemicals' column.
        mesh_terms_df (pd.DataFrame): MeSH terms dataframe for normalization.
        output_file (str): Path to output TSV file with the columns
            'name', 'heading' and 'synonyms'.
    """
    with open(input_file, "r") as f_in, open(output_file, "w") as f_out:
        f_out.write("name\theading\tsynonyms\n")
        for row in pd.read_csv(f_in, sep="\t").itertuples():
            # An empty 'chemicals' cell is read by pandas as NaN (a float),
            # which has no .split(); such rows contain nothing to normalize.
            if pd.isna(row.chemicals):
                continue
            for chem in str(row.chemicals).split("|"):
                name = chem.strip()
                if not name:
                    # e.g. "a||b" or a trailing '|'
                    continue
                normalized_chemical = normalize_chemical(mesh_terms_df, Chemical(name))
                synonyms = normalized_chemical.synonyms
                # The matching script reads this column back with split(";"),
                # so a collection is serialized as a ';'-joined string instead
                # of its Python repr. Any other type is written unchanged.
                # NOTE(review): synonyms is presumably a set of strings - confirm.
                if isinstance(synonyms, (set, frozenset, list, tuple)):
                    synonyms = ";".join(sorted(str(item) for item in synonyms))
                f_out.write(
                    f"{normalized_chemical.name}\t{normalized_chemical.heading}\t{synonyms}\n"
                )


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the chemical normalization script.

    Returns:
        argparse.Namespace: Namespace with ``mesh_terms``, ``input_file``
        and ``outdir``.
    """
    cli = argparse.ArgumentParser(description="Normalize chemicals using MeSH terms")
    # All three options are mandatory; declare them from one table.
    required_options = (
        ("--mesh_terms", "MeSH terms dataframe"),
        ("--input_file", "Input TSV file with chemicals"),
        ("--outdir", "Output directory for saving results"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: load the MeSH table, then write
    # <outdir>/normalized_chemicals.tsv for the given input file.
    cli_args = parse_args()
    mesh_table = pd.read_csv(cli_args.mesh_terms, sep="\t")
    save_file(
        cli_args.input_file,
        mesh_table,
        os.path.join(cli_args.outdir, "normalized_chemicals.tsv"),
    )
71 changes: 71 additions & 0 deletions tools/aoptk/aoptk_download_abstracts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from aoptk.literature.databases.pubmed import PubMed
from aoptk.literature.databases.europepmc import EuropePMC
from aoptk.literature.abstract import Abstract
from Bio import Entrez

import argparse


def download_abstracts(
    database_with_ids_path: str, database: str, email: str
) -> list[Abstract]:
    """Generate a list of abstracts from the specified literature database.

    Args:
        database_with_ids_path (str): Path to the file containing database
            IDs, one per line. Blank lines are ignored.
        database (str): Database to query, "pubmed" or "europepmc".
        email (str): Assigned to ``Entrez.email``; only used for "pubmed".

    Returns:
        list[Abstract]: The result of the selected client's ``get_abstracts()``.

    Raises:
        ValueError: If ``database`` is not one of the supported names.
    """
    with open(database_with_ids_path, "r") as f:
        # Skip blank lines so no empty ID is handed to the client.
        ids = [line.strip() for line in f if line.strip()]
    if database == "pubmed":
        Entrez.email = email
        # The instance is created without running PubMed.__init__ and the IDs
        # are injected directly.
        # NOTE(review): presumably because the constructor expects a search
        # query - confirm.
        pubmed = PubMed.__new__(PubMed)
        pubmed.id_list = ids
        return pubmed.get_abstracts()
    if database == "europepmc":
        europepmc = EuropePMC("")
        europepmc.id_list = ids
        return europepmc.get_abstracts()
    # Fail here with a clear message instead of returning None, which would
    # only surface later as an unrelated TypeError in the caller.
    raise ValueError(
        f"Unsupported database {database!r}; expected 'pubmed' or 'europepmc'"
    )


def save_file(abstracts: list[Abstract], filename: str) -> None:
    """Save abstracts to a TSV file.

    Rows are written with ``csv.writer`` so that abstract text containing
    tabs, newlines or quotes is quoted instead of breaking the table.

    Args:
        abstracts (list[Abstract]): List of abstracts to save. Each needs a
            ``publication_id`` and a ``text`` attribute.
        filename (str): Name of the output file, with the columns 'id' and 'text'.
    """
    # Imported locally because this script does not import csv at module level.
    import csv

    # newline="" hands line-ending handling to the csv module.
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f, delimiter="\t", lineterminator="\n")
        writer.writerow(["id", "text"])
        for abstract in abstracts:
            writer.writerow([abstract.publication_id, abstract.text])


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the abstract download script.

    Returns:
        argparse.Namespace: Namespace with ``database_with_ids``,
        ``database`` (either "pubmed" or "europepmc"), ``email`` and ``outdir``.
    """
    cli = argparse.ArgumentParser(
        description="Download abstracts from PubMed or Europe PMC using aoptk"
    )
    # Every option is mandatory.
    mandatory = {"required": True}
    cli.add_argument(
        "--database_with_ids",
        help="Path to the file containing database IDs",
        **mandatory,
    )
    cli.add_argument(
        "--database",
        choices=["pubmed", "europepmc"],
        help="Database to query",
        **mandatory,
    )
    cli.add_argument(
        "--email", help="Email to comply with NCBI guidelines", **mandatory
    )
    cli.add_argument(
        "--outdir", help="Output directory for saving files", **mandatory
    )
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: download the abstracts for the listed IDs and
    # write them to <outdir>/abstracts.tsv.
    cli_args = parse_args()
    downloaded = download_abstracts(
        cli_args.database_with_ids, cli_args.database, cli_args.email
    )
    destination = f"{cli_args.outdir}/abstracts.tsv"
    save_file(downloaded, destination)
42 changes: 42 additions & 0 deletions tools/aoptk/aoptk_download_pdfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from aoptk.literature.databases.europepmc import EuropePMC
from aoptk.literature.abstract import Abstract

import argparse


def download_pdfs(
    database_with_ids_path: str,
    output_dir: str,
) -> list[Abstract]:
    """Download the PDFs of the listed publications through Europe PMC.

    Args:
        database_with_ids_path (str): Path to the file containing database
            IDs, one per line. Blank lines are ignored.
        output_dir (str): Assigned to the client's ``storage`` attribute.
            NOTE(review): presumably the directory the PDFs are written to -
            confirm.

    Returns:
        The result of ``EuropePMC.pdfs()``.
        NOTE(review): the ``list[Abstract]`` annotation looks carried over
        from the abstract download script - confirm the actual return type.
    """
    with open(database_with_ids_path, "r") as f:
        # Skip blank lines so no empty ID is handed to the client.
        ids = [line.strip() for line in f if line.strip()]
    # A plain constructor call replaces the __new__ + explicit __init__ pair.
    europepmc = EuropePMC("")
    europepmc.storage = output_dir
    europepmc.id_list = ids
    return europepmc.pdfs()


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the PDF download script.

    Returns:
        argparse.Namespace: Namespace with ``database_with_ids`` and ``outdir``.
    """
    parser = argparse.ArgumentParser(
        # The previous description was copied from the abstract download
        # script; this tool downloads PDFs and only talks to Europe PMC.
        description="Download PDFs from Europe PMC using aoptk"
    )
    parser.add_argument(
        "--database_with_ids",
        required=True,
        help="Path to the file containing database IDs",
    )
    parser.add_argument(
        "--outdir", required=True, help="Output directory for saving files"
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: download the PDFs for the listed IDs into the
    # output directory. The return value is not used.
    cli_args = parse_args()
    download_pdfs(cli_args.database_with_ids, cli_args.outdir)
66 changes: 66 additions & 0 deletions tools/aoptk/aoptk_find_relationships.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from aoptk.chemical import Chemical
from aoptk.effect import Effect
from aoptk.relationships.relationship import Relationship
from aoptk.relationships.zero_shot_classification_single import (
ZeroShotClassificationSingle,
)
import pandas as pd
import argparse
import os


def find_relationships(
    text: str, chemicals: list[Chemical], effects: list[Effect]
) -> list[Relationship]:
    """Find relationships between chemicals and effects in a text.

    Args:
        text (str): Input text to analyze.
        chemicals (list[Chemical]): Chemicals to consider.
        effects (list[Effect]): Effects to consider.

    Returns:
        list[Relationship]: The result of
        ``ZeroShotClassificationSingle().find_relationships``.
    """
    classifier = ZeroShotClassificationSingle()
    found = classifier.find_relationships(
        text=text, chemicals=chemicals, effects=effects
    )
    return found


def save_file(input_file: str, output_file: str, effects: list[Effect]) -> None:
    """Process a TSV file with chemicals, find relationships, and save results.

    Each input row carries a text and a '|'-separated list of chemical names.
    Relationships between those chemicals and ``effects`` are looked up in the
    row's text and written one per line. Rows without chemicals are skipped.

    Args:
        input_file (str): Path to input TSV file with the columns 'id',
            'text' and 'chemicals'.
        output_file (str): Path to output TSV file or directory. For a
            directory, 'relationships.tsv' inside it is used.
        effects (list[Effect]): Effects to look for in every row.
    """
    if os.path.isdir(output_file):
        output_file = os.path.join(output_file, "relationships.tsv")

    with open(input_file, "r") as text_in, open(output_file, "w") as f_out:
        f_out.write("id\tchemical\teffect\trelationship\n")
        for row in pd.read_csv(text_in, sep="\t").itertuples():
            # An empty 'chemicals' cell is read by pandas as NaN (a float),
            # which has no .split(); such rows have nothing to relate.
            if pd.isna(row.chemicals):
                continue
            names = [part.strip() for part in str(row.chemicals).split("|")]
            # Drop empty names, e.g. from "a||b" or a trailing '|'.
            chemicals = [Chemical(name) for name in names if name]
            if not chemicals:
                continue
            relationships = find_relationships(row.text, chemicals, effects)
            for relationship in relationships:
                f_out.write(
                    f"{row.id}\t{relationship.chemical.name}\t{relationship.effect.name}\t{relationship}\n"
                )


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the relationship finding script.

    Returns:
        argparse.Namespace: Namespace with ``input_file``, ``outdir`` and
        ``effects``.
    """
    cli = argparse.ArgumentParser(
        description="Find relationships between chemicals and effects"
    )
    # All three options are mandatory; declare them from one table.
    required_options = (
        ("--input_file", "Input TSV file with chemicals and effects"),
        ("--outdir", "Output directory for saving files"),
        ("--effects", "List of effects to consider"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)
    return cli.parse_args()


if __name__ == "__main__":
    # Script entry point: --effects is a comma-separated list; each item
    # becomes one Effect. Results go to <outdir>/relationships.tsv.
    cli_args = parse_args()
    requested_effects = []
    for label in cli_args.effects.split(","):
        requested_effects.append(Effect(label))
    destination = os.path.join(cli_args.outdir, "relationships.tsv")
    save_file(cli_args.input_file, destination, requested_effects)
Loading
Loading