RECETOX · rdurnik · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026 · Jan 27, 2026
diff --git a/tools/aoptk/aoptk_chemical_identification.py b/tools/aoptk/aoptk_chemical_identification.py
@@ -0,0 +1,50 @@
+from aoptk.chemical import Chemical
+from aoptk.spacy_processor import Spacy
+import argparse
+import csv
+import os
+
+
+def find_chemicals(text: str) -> list[Chemical]:
+    """Find the chemicals mentioned in a text.
+
+    A new Spacy processor is created on every call and asked for the chemicals.
+    """
+    processor = Spacy()
+    return processor.find_chemical(text)
+
+
+def save_file(input_file: str, output_file: str) -> None:
+    """Find chemicals in each row of a TSV file and write them to a new TSV.
+
+    Args:
+        input_file (str): Path to input TSV file with 'id' and 'text' columns.
+        output_file (str): Path to output TSV (id, text, '|'-joined chemicals).
+    """
+    with open(input_file, newline="") as f_in, open(output_file, "w", newline="") as f_out:
+        # csv.writer quotes tabs/newlines/quotes in text so each row stays intact.
+        writer = csv.writer(f_out, delimiter="\t", lineterminator="\n")
+        writer.writerow(["id", "text", "chemicals"])
+        for row in csv.DictReader(f_in, delimiter="\t"):
+            # Deduplicate, then sort so the output is stable between runs.
+            names = sorted({chem.name for chem in find_chemicals(row["text"]) or []})
+            writer.writerow([row["id"], row["text"], "|".join(names)])
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the required --input_file and --outdir command-line options."""
+    description = "Identify chemicals in a TSV file with text column"
+    cli = argparse.ArgumentParser(description=description)
+    required_options = {
+        "--input_file": "Input TSV file with text column",
+        "--outdir": "Output directory for saving results",
+    }
+    for flag, help_text in required_options.items():
+        cli.add_argument(flag, required=True, help=help_text)
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    cli_args = parse_args()
+    # Results always go to chemicals.tsv inside the requested output directory.
+    save_file(cli_args.input_file, os.path.join(cli_args.outdir, "chemicals.tsv"))
diff --git a/tools/aoptk/aoptk_chemical_matching.py b/tools/aoptk/aoptk_chemical_matching.py
@@ -0,0 +1,92 @@
+from aoptk.chemical import Chemical
+import argparse
+import pandas as pd
+
+
+def extract_chemicals_to_match(input_file: str) -> list[Chemical]:
+    """Build one Chemical per row of a TSV with name, heading, synonyms columns."""
+    extracted = []
+    with open(input_file, "r") as tsv_handle:
+        for rec in pd.read_csv(tsv_handle, sep="\t").itertuples():
+            entry = Chemical(rec.name)
+            entry.heading = rec.heading
+            raw = rec.synonyms  # a missing value means the chemical has no synonyms
+            entry._synonyms = set(raw.split(";")) if pd.notna(raw) else set()
+            extracted.append(entry)
+    return extracted
+
+
+def match_chemicals_with_loose_equality(
+    list_of_relevant_chemicals: list[Chemical],
+    chemicals: list[Chemical],
+) -> list[str]:
+    """Collect the names of chemicals that are similar to a relevant chemical.
+
+    Args:
+        list_of_relevant_chemicals (list[Chemical]): List of relevant chemicals.
+        chemicals (list[Chemical]): List of chemicals.
+
+    Returns:
+        list[str]: Name of each chemical with at least one similar relevant
+            chemical, in input order; a name appears once per matching input.
+    """
+    def has_match(candidate: Chemical) -> bool:
+        return any(candidate.similar(ref) for ref in list_of_relevant_chemicals)
+    return [candidate.name for candidate in chemicals if has_match(candidate)]
+
+
+def generate_relevant_chemicals(chemical_database: str) -> list[Chemical]:
+    """Generate a list of relevant chemicals from Excel file.
+
+    Args:
+        chemical_database (str): Path to the user-defined chemical database in
+            Excel; it must have a 'chemical_name' column.
+
+    Returns:
+        list[Chemical]: One Chemical per distinct lower-cased name.
+    """
+    names = pd.read_excel(chemical_database)["chemical_name"]
+    # Drop empty cells first; astype(str) would turn them into the name "nan".
+    unique_names = names.dropna().astype(str).str.lower().unique()
+    return [Chemical(name) for name in unique_names]
+
+
+def save_file(relevant_chemicals_names: list[str], output_file: str) -> None:
+    """Write the matched chemical names to a single-column TSV file.
+
+    Args:
+        relevant_chemicals_names (list[str]): Names of the matched chemicals.
+        output_file (str): Path to output TSV file.
+    """
+    header = "matched_chemicals\n"
+    body_lines = [f"{matched_name}\n" for matched_name in relevant_chemicals_names]
+    with open(output_file, "w") as f_out:
+        f_out.writelines([header, *body_lines])
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the three required command-line options of the matching tool."""
+    cli = argparse.ArgumentParser(description="Match chemicals using loose equality")
+    # Every option is mandatory; the dict order is the order they are registered.
+    required_options = {
+        "--list_of_relevant_chemicals": "List of relevant chemicals",
+        "--normalized_chemicals": "List of normalized chemicals",
+        "--outdir": "Output directory for saving files",
+    }
+    for flag, help_text in required_options.items():
+        cli.add_argument(flag, required=True, help=help_text)
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    cli_args = parse_args()
+    # Candidates come from the normalized-chemicals TSV; the reference list
+    # comes from the user-supplied Excel database.
+    candidates = extract_chemicals_to_match(cli_args.normalized_chemicals)
+    reference_chemicals = generate_relevant_chemicals(
+        cli_args.list_of_relevant_chemicals
+    )
+    matched_names = match_chemicals_with_loose_equality(
+        reference_chemicals, candidates
+    )
+    save_file(matched_names, output_file=f"{cli_args.outdir}/matched_chemicals.tsv")
diff --git a/tools/aoptk/aoptk_chemical_normalization.py b/tools/aoptk/aoptk_chemical_normalization.py
@@ -0,0 +1,54 @@
+from aoptk.chemical import Chemical
+from aoptk.normalization.mesh_terms import MeshTerms
+import argparse
+import pandas as pd
+import os
+
+
+def normalize_chemical(mesh_terms: pd.DataFrame, chemical: Chemical) -> Chemical:
+    """Normalize a chemical using MeSH terms.
+
+    A MeshTerms normalizer is built from ``mesh_terms`` (a pd.DataFrame) on
+    every call and applied to ``chemical``; its result is returned unchanged.
+    """
+    normalizer = MeshTerms(mesh_terms)
+    return normalizer.normalize_chemical(chemical)
+
+
+def save_file(input_file: str, mesh_terms_df: pd.DataFrame, output_file: str) -> None:
+    """Process a TSV file with chemicals, normalize them, and save results.
+
+    Empty 'chemicals' cells are skipped; non-string synonyms are ';'-joined.
+
+    Args:
+        input_file (str): Path to input TSV file with a 'chemicals' column.
+        mesh_terms_df (pd.DataFrame): MeSH terms dataframe for normalization.
+        output_file (str): Path to output TSV file (name, heading, synonyms).
+    """
+    with open(input_file, "r") as f_in, open(output_file, "w") as f_out:
+        f_out.write("name\theading\tsynonyms\n")
+        for cell in pd.read_csv(f_in, sep="\t")["chemicals"].dropna().astype(str):
+            for chem in filter(None, map(str.strip, cell.split("|"))):
+                norm = normalize_chemical(mesh_terms_df, Chemical(chem))
+                syn = norm.synonyms  # aoptk_chemical_matching.py splits this on ';'
+                syn = syn if isinstance(syn, str) else ";".join(sorted(map(str, syn or ())))
+                f_out.write(f"{norm.name}\t{norm.heading}\t{syn}\n")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the required --mesh_terms, --input_file and --outdir options."""
+    cli = argparse.ArgumentParser(description="Normalize chemicals using MeSH terms")
+    for flag, help_text in (
+        ("--mesh_terms", "MeSH terms dataframe"),
+        ("--input_file", "Input TSV file with chemicals"),
+        ("--outdir", "Output directory for saving results"),
+    ):
+        cli.add_argument(flag, required=True, help=help_text)
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    cli_args = parse_args()
+    target_path = os.path.join(cli_args.outdir, "normalized_chemicals.tsv")
+    mesh_table = pd.read_csv(cli_args.mesh_terms, sep="\t")  # read once up front
+    save_file(cli_args.input_file, mesh_table, target_path)
diff --git a/tools/aoptk/aoptk_download_abstracts.py b/tools/aoptk/aoptk_download_abstracts.py
@@ -0,0 +1,71 @@
+from aoptk.literature.databases.pubmed import PubMed
+from aoptk.literature.databases.europepmc import EuropePMC
+from aoptk.literature.abstract import Abstract
+from Bio import Entrez
+
+import argparse
+
+
+def download_abstracts(
+    database_with_ids_path: str, database: str, email: str
+) -> list[Abstract]:
+    """Generate a list of abstracts from the specified literature database.
+
+    Args:
+        database_with_ids_path (str): Path to a file with one database ID per line.
+        database (str): "pubmed" or "europepmc"; any other value raises ValueError.
+        email (str): Contact email assigned to Entrez.email for "pubmed".
+    """
+    if database not in ("pubmed", "europepmc"):
+        raise ValueError(f"Unsupported database: {database!r}")
+    with open(database_with_ids_path, "r") as f:
+        ids = [stripped for line in f if (stripped := line.strip())]  # skip blanks
+    if database == "pubmed":
+        Entrez.email = email
+    # PubMed is created without running __init__; only id_list is assigned here.
+    client = PubMed.__new__(PubMed) if database == "pubmed" else EuropePMC("")
+    client.id_list = ids
+    return client.get_abstracts()
+
+
+def save_file(abstracts: list[Abstract], filename: str) -> None:
+    """Save abstracts to a TSV file ('id', 'text'), quoting tabs and newlines.
+
+    Args:
+        abstracts (list[Abstract]): List of abstracts to save.
+        filename (str): Name of the output file.
+    """
+    import csv  # local import: this script does not import csv at file level
+    with open(filename, "w", newline="") as f:
+        out = csv.writer(f, delimiter="\t", lineterminator="\n")
+        out.writerows([("id", "text"), *((a.publication_id, a.text) for a in abstracts)])
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the abstract download tool.
+
+    Returns:
+        argparse.Namespace: Holds database_with_ids, database, email and outdir.
+    """
+    cli = argparse.ArgumentParser(
+        description="Download abstracts from PubMed or Europe PMC using aoptk"
+    )
+    # Extra keyword arguments per option; every option is required.
+    option_specs = {
+        "--database_with_ids": {"help": "Path to the file containing database IDs"},
+        "--database": {
+            "choices": ["pubmed", "europepmc"],
+            "help": "Database to query",
+        },
+        "--email": {"help": "Email to comply with NCBI guidelines"},
+        "--outdir": {"help": "Output directory for saving files"},
+    }
+    for flag, extra in option_specs.items():
+        cli.add_argument(flag, required=True, **extra)
+    return cli.parse_args()
+
+
+if __name__ == "__main__":  # script entry: fetch abstracts, write abstracts.tsv
+    opts = parse_args()
+    fetched = download_abstracts(opts.database_with_ids, opts.database, opts.email)
+    save_file(fetched, f"{opts.outdir}/abstracts.tsv")
diff --git a/tools/aoptk/aoptk_download_pdfs.py b/tools/aoptk/aoptk_download_pdfs.py
@@ -0,0 +1,42 @@
+from aoptk.literature.databases.europepmc import EuropePMC
+from aoptk.literature.abstract import Abstract
+
+import argparse
+
+
+def download_pdfs(database_with_ids_path: str, output_dir: str) -> list[Abstract]:
+    """Call EuropePMC.pdfs() for the listed IDs with storage set to output_dir.
+
+    Args:
+        database_with_ids_path (str): Path to a file with one database ID per line.
+        output_dir (str): Directory assigned to the EuropePMC ``storage`` attribute.
+
+    Returns:
+        list[Abstract]: Result of EuropePMC.pdfs(); NOTE(review): confirm the type.
+    """
+    with open(database_with_ids_path, "r") as f:
+        ids = [stripped for line in f if (stripped := line.strip())]  # skip blanks
+    europepmc = EuropePMC("")  # equivalent to the former __new__ + __init__("")
+    europepmc.storage = output_dir
+    europepmc.id_list = ids
+    return europepmc.pdfs()
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the required --database_with_ids and --outdir options."""
+    # The description names what this tool requests: PDFs through EuropePMC.
+    parser = argparse.ArgumentParser(
+        description="Download PDFs from Europe PMC using aoptk"
+    )
+    required_options = {
+        "--database_with_ids": "Path to the file containing database IDs",
+        "--outdir": "Output directory for saving files",
+    }
+    for flag, help_text in required_options.items():
+        parser.add_argument(flag, required=True, help=help_text)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    download_pdfs(args.database_with_ids, args.outdir)  # return value is unused
diff --git a/tools/aoptk/aoptk_find_relationships.py b/tools/aoptk/aoptk_find_relationships.py
@@ -0,0 +1,66 @@
+from aoptk.chemical import Chemical
+from aoptk.effect import Effect
+from aoptk.relationships.relationship import Relationship
+from aoptk.relationships.zero_shot_classification_single import (
+    ZeroShotClassificationSingle,
+)
+import pandas as pd
+import argparse
+import os
+
+
+def find_relationships(
+    text: str, chemicals: list[Chemical], effects: list[Effect]
+) -> list[Relationship]:
+    """Find relationships between chemicals and effects in a text.
+
+    Args:
+        text (str): Input text to analyze.
+        chemicals (list[Chemical]): Chemicals to consider.
+        effects (list[Effect]): Effects to consider.
+    """
+    # A new ZeroShotClassificationSingle is created for every call.
+    finder = ZeroShotClassificationSingle()
+    return finder.find_relationships(text=text, chemicals=chemicals, effects=effects)
+
+
+def save_file(input_file: str, output_file: str, effects: list[Effect]) -> None:
+    """Process a TSV file with chemicals and effects, find relationships, and save results.
+
+    Args:
+        input_file (str): Path to input TSV file with id, text, chemicals columns.
+        output_file (str): Path to output TSV file or directory.
+        effects (list[Effect]): Effects to relate the chemicals to.
+    """
+    if os.path.isdir(output_file):
+        output_file = os.path.join(output_file, "relationships.tsv")
+
+    with open(input_file, "r") as text_in, open(output_file, "w") as f_out:
+        f_out.write("id\tchemical\teffect\trelationship\n")
+        for row in pd.read_csv(text_in, sep="\t").itertuples():
+            if pd.isna(row.chemicals):  # empty cell is read as NaN: nothing to relate
+                continue
+            chemicals = [Chemical(chem) for chem in str(row.chemicals).split("|")]
+            for rel in find_relationships(row.text, chemicals, effects):
+                f_out.write(f"{row.id}\t{rel.chemical.name}\t{rel.effect.name}\t{rel}\n")
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the required --input_file, --outdir and --effects options."""
+    cli = argparse.ArgumentParser(
+        description="Find relationships between chemicals and effects"
+    )
+    for flag, help_text in (
+        ("--input_file", "Input TSV file with chemicals and effects"),
+        ("--outdir", "Output directory for saving files"),
+        ("--effects", "List of effects to consider"),
+    ):
+        cli.add_argument(flag, required=True, help=help_text)
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    # Tolerate "a, b" and trailing commas: trim names and drop empty entries.
+    effects = [Effect(e.strip()) for e in args.effects.split(",") if e.strip()]
+    save_file(args.input_file, os.path.join(args.outdir, "relationships.tsv"), effects)