From e76f6a9c5483b91883d9abfc6c0147a57e65e726 Mon Sep 17 00:00:00 2001 From: Justin Payne Date: Thu, 21 May 2026 18:17:46 -0500 Subject: [PATCH 1/2] Add bioproject2srr Galaxy tool for retrieving SRR accessions from BioProjects Retrieves SRR accessions and biosample metadata from NCBI BioProjects using Entrez E-utilities API. Recursively follows links to subprojects and biosamples. Outputs: - accessions.txt: deduplicated list of SRR accessions - metadata.tsv: full metadata table joining biosample attributes with SRA runs Co-Authored-By: Claude Sonnet 4.5 --- tools/bioproject2srr/bioproject2srr/.shed.yml | 6 + .../bioproject2srr/bioproject2srr/bio2srr.py | 276 ++++++++++++++++++ .../bioproject2srr/bioproject2srr/bio2srr.xml | 31 ++ .../bioproject2srr/job_conf.yml | 34 +++ .../bioproject2srr/test-data/accessions.txt | 5 + .../bioproject2srr/test-data/metadata.tsv | 8 + tools/bioproject2srr/bioproject2srr/tests.py | 42 +++ 7 files changed, 402 insertions(+) create mode 100644 tools/bioproject2srr/bioproject2srr/.shed.yml create mode 100644 tools/bioproject2srr/bioproject2srr/bio2srr.py create mode 100644 tools/bioproject2srr/bioproject2srr/bio2srr.xml create mode 100644 tools/bioproject2srr/bioproject2srr/job_conf.yml create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/accessions.txt create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv create mode 100644 tools/bioproject2srr/bioproject2srr/tests.py diff --git a/tools/bioproject2srr/bioproject2srr/.shed.yml b/tools/bioproject2srr/bioproject2srr/.shed.yml new file mode 100644 index 0000000..59f0183 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/.shed.yml @@ -0,0 +1,6 @@ +categories: ['Tools'] +description: Give a Bioproject accession, get a list of associated SRR accessions +name: bioproject2srr +owner: jpayne +ignore: +- CLAUDE.md diff --git a/tools/bioproject2srr/bioproject2srr/bio2srr.py b/tools/bioproject2srr/bioproject2srr/bio2srr.py new file mode 100644 index 0000000..09a6456 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/bio2srr.py @@ -0,0 +1,276 @@ +"Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils" + +import requests +import sys +import csv +import os + +try: + from itertools import batched +except ImportError: + from itertools import islice + def batched(iterable, n): + "Batch data into tuples of length n. The last batch may be shorter." + # batched('ABCDEFG', 3) --> ABC DEF G + if n < 1: + raise ValueError('n must be at least one') + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch +from functools import cmp_to_key +from time import sleep +from xml.etree import ElementTree as xml + +esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" +esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" +elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" + + +import logging +logging.basicConfig(level=logging.INFO) + +logger = logging.getLogger("bio2srr") + +extra_params = {} + +api_key = os.environ.get("NCBI_API_KEY") + +if api_key: + logger.info(f"Using NCBI API key {api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}") + extra_params["api_key"] = api_key + +def log(msg): + if api_key: + logger.info(msg.replace(api_key, f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")) # fix logging later + else: + logger.info(msg) + +def get_tag(root, tag): + val = root.find(tag) + if val is not None: + return val.text + log(f"No result for {tag}") + + + +def header_sort_override(a, b): + if a == b: + return 0 + try: + for name in ["bioproject", "srr_accession", "biosample_accession", "organism", "taxid", "package",]: + if a == name: + return -1 + if b == name: + return 1 + except: + pass + if a < b: + return -1 + else: + return 1 + +hso = cmp_to_key(header_sort_override) + +def resolve_bioproject_ids_and_links(bioproject_id_list): + "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML" + for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list): + log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}") + #get bioproject to bioproject links + response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params)) + response.raise_for_status() + reply = response.json() + linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}]) + if len(linksets) >= 3: + for id in linksets[2].get("links", []): #third index is the up to down links + response = requests.get(esummary, params=dict(id=id, db="bioproject", format="json")) + response.raise_for_status() + replyy = response.json() + biop = replyy["result"][id]["project_acc"] + if id not in bioproject_id_list: + bioproject_id_list.append((biop, id)) # recurse over bioproject links + # get bioproject to biosample links + response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params)) + response.raise_for_status() + reply = response.json() + links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []) + log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})") + for ids in batched(links, 200): + response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json")) + response.raise_for_status() + replyy = response.json() + for field, value in replyy.get("result", {}).items(): + if "uids" not in field: + yield bioproject, field, value["sampledata"] # this is XML, deleriously + sleep(1 if not api_key else 0.1) + + +biosample_example = """ + + + SAMN17131268 + CJP19-D996 + + + Pathogen: environmental/food/other sample from Campylobacter jejuni + + Campylobacter jejuni + + + + FDA Center for Food Safety and Applied Nutrition + + + Pathogen.env + + Pathogen.env.1.0 + + CJP19-D996 + missing + missing + CDC + missing + missing + CFSAN091032 + GenomeTrakr + FDA Center for Food Safety and Applied Nutrition + + + 681235 + + + + +""" + +def flatten_biosample_xml(biosampxml): + root = xml.fromstring(biosampxml) + accession = get_tag(root, r'.//Id[@db="BioSample"]') + # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]') + organism = get_tag(root, r".//OrganismName") + tax_id = root.find(r".//Organism").attrib.get("taxonomy_id") + package = get_tag(root, r".//Package") + sampledict = dict( + biosample_accession=accession, + # sample_name=sample_name, + organism = organism, + taxid = tax_id, + package = package + ) + for attribute in root.findall("Attributes/Attribute"): + sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text + + return sampledict + + +def yield_sra_runs_from_sample(biosample): + sleep(1 if not api_key else 0.1) + response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params)) + response.raise_for_status() + reply = response.json() + for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200): + sleep(1 if not api_key else 0.1) + response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params)) + response.raise_for_status() + replyy = response.json() + for field, value in replyy.get("result", {}).items(): + if "uids" not in field: + yield field, value.get("runs") + + +runs_example = """ + + +""" + +def flatten_runs(runxml): + root = xml.fromstring(f"{runxml}") # gotta fix their garbage embedded XML since it isn't singly-rooted + for run in root.findall(".//Run"): + if run.attrib["is_public"] == "false": + logger.warning(f"Skipping non-public run {run.attrib['acc']}") + yield dict( + sra_run_accession = run.attrib["acc"], + total_spots = run.attrib["total_spots"], + total_bases = run.attrib["total_bases"], + ) + + + +def main(starting_bioproject): + rows = [] + response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json")) + response.raise_for_status() + reply = response.json() + try: + bioproject_id = reply["esearchresult"]["idlist"][0] + log(f"Found UID {bioproject_id} for '{starting_bioproject}'") + except IndexError: + logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"") + sys.exit(1) + sleep(1 if not api_key else 0.1) + for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]): + try: + sampledict = flatten_biosample_xml(biosample_xml) + except KeyError: + log(biosample_xml) + raise + sampledict["bioproject"] = bioproject + noruns = True + for sra, runs in yield_sra_runs_from_sample(biosample): + for run in flatten_runs(runs.strip()): + noruns = False + run.update(sampledict) + rows.append(run) + if noruns: + rows.append(sampledict) + + log(f"Writing {len(rows)} rows to metadata.tsv") + + header = set() + for row in rows: + for key in row.keys(): + header.add(key) + + header = sorted(list(header), key=hso) + # logger.info(f"Header: {header}") + + rows.sort(key=lambda x: x["biosample_accession"]) + + with open("metadata.tsv", "w") as f: + writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel") + writer.writeheader() + writer.writerows(rows) + + # check for duplicate runs and unreleased samples + + accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")] + + raw_length = len(accessions) + + accessions = sorted(list(set(accessions))) + + if raw_length < len(rows): + logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt") + + if len(accessions) < raw_length: + logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not") + + log(f"Writing {len(accessions)} unique accessions to accessions.txt") + + with open("accessions.txt", "w") as f: + for accession in accessions: + f.write(accession + "\n") + + +if __name__ == "__main__": + b = sys.argv[1].strip() + log(f"Starting with {b}") + try: + main(b) + except requests.HTTPError as e: + logger.error(e) + sys.exit(1) + + + + + diff --git a/tools/bioproject2srr/bioproject2srr/bio2srr.xml b/tools/bioproject2srr/bioproject2srr/bio2srr.xml new file mode 100644 index 0000000..6f76446 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/bio2srr.xml @@ -0,0 +1,31 @@ + + Retrieve SRR accessions and sample metadata from BioProject. Recursively follows links to subprojects. + + docker.io/crashfrog/bp2srr-galaxy:latest + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/tools/bioproject2srr/bioproject2srr/job_conf.yml b/tools/bioproject2srr/bioproject2srr/job_conf.yml new file mode 100644 index 0000000..529dd53 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/job_conf.yml @@ -0,0 +1,34 @@ +runners: + local: + load: galaxy.jobs.runners.local:LocalJobRunner + workers: 16 + +# handling: +# processes: +# handler0: + +execution: + default: local + environments: + local: + runner: local + docker_local: + runner: local + docker_enabled: true + # container: "auto" + docker_volumes: $defaults + # docker_set_user: null + docker_run_extra_arguments: "--entrypoint ''" + docker_set_user: root + +tools: +- id: bio2srr + # handler: handler0 + environment: docker_local + +limits: +- + # Amount of time a job can run (in any environment) before it + # will be terminated by Galaxy. + type: walltime + value: '01:00:00' \ No newline at end of file diff --git a/tools/bioproject2srr/bioproject2srr/test-data/accessions.txt b/tools/bioproject2srr/bioproject2srr/test-data/accessions.txt new file mode 100644 index 0000000..94346ce --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/test-data/accessions.txt @@ -0,0 +1,5 @@ +SRR13160357 +SRR13160358 +SRR13160359 +SRR13160360 +SRR13167188 diff --git a/tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv b/tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv new file mode 100644 index 0000000..2bb4d4c --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv @@ -0,0 +1,8 @@ +bioproject biosample_accession organism taxid package Genus ProjectAccession PublicAccession Species attribute_package collected_by collection_date geo_loc_name isolate isolate_name_alias isolation_source lat_lon project_name sequenced_by sra_run_accession strain total_bases total_spots +PRJNA681235 SAMN16946945 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091029 coli environmental/food/other CDC 2019 USA CFSAN091029 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition SRR13160357 ECP19-2498 312756765 660858 +PRJNA681235 SAMN16946945 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091029 coli environmental/food/other CDC 2019 USA CFSAN091029 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition SRR13160358 ECP19-2498 327001270 704624 +PRJNA681235 SAMN16946946 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091027 coli environmental/food/other CDC 2019 USA CFSAN091027 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition SRR13160360 ECP19-598 316865532 683880 +PRJNA681235 SAMN16946947 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091028 coli environmental/food/other CDC 2019 USA CFSAN091028 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition SRR13160359 ECP19-798 473318585 1007158 +PRJNA681235 SAMN16956340 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091030 coli environmental/food/other CDC 2019 USA CFSAN091030 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition SRR13167188 ECP19-198 385043067 827691 +PRJNA681235 SAMN17131267 Campylobacter jejuni 197 Pathogen.env.1.0 CDC missing missing CFSAN091031 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition CJP19-D445 +PRJNA681235 SAMN17131268 Campylobacter jejuni 197 Pathogen.env.1.0 CDC missing missing CFSAN091032 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition CJP19-D996 diff --git a/tools/bioproject2srr/bioproject2srr/tests.py b/tools/bioproject2srr/bioproject2srr/tests.py new file mode 100644 index 0000000..a9a5e18 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/tests.py @@ -0,0 +1,42 @@ +import pytest + +from bio2srr import * + + +def test_element_tree_xpath(): + from xml.etree import ElementTree as xml + root = xml.fromstring(biosample_example) + assert root.find(".//Id[@db='BioSample']") is not None + +def test_flatten_biosample_xml(): + d = flatten_biosample_xml(biosample_example) + assert d['biosample_accession'] == 'SAMN17131268' + assert d['organism'] == 'Campylobacter jejuni' + assert d['isolate'] == 'CFSAN091032' + +def test_flatten_runs(): + d = list(flatten_runs(runs_example)) + assert len(d) == 2 + +def test_header_sort_override_consistency(): + import random + L = ["C", "B", "A", "taxid", "bioproject"] + L.sort(key=hso) + # assert L[0] == "bioproject" + A = L.copy() + assert A == L + R = [] + for _ in range(100): + random.shuffle(A) + A.sort(key=hso) + R.append(A == L) + assert all(R) + +def test_hso_override(): + assert header_sort_override("bioproject", "taxid") < 0 + assert header_sort_override("taxid", "bioproject") > 0 + assert header_sort_override("taxid", "taxid") == 0 + +def test_hso_regular(): + assert header_sort_override("A", "B") < 0 + assert header_sort_override("B", "A") > 0 \ No newline at end of file From 4f8e0a9ddd47d9d17469b7dba24b1c41f04e1c26 Mon Sep 17 00:00:00 2001 From: Justin Payne Date: Thu, 21 May 2026 18:18:10 -0500 Subject: [PATCH 2/2] Add srr2sam tool for reverse SRR to BioSample lookup and documentation Adds complementary tool that retrieves BioSample metadata from a list of SRR accessions. Performs reverse operation of bio2srr by traversing NCBI Entrez from SRA to BioSample. Output: tabular file with SRR accession as first column followed by all BioSample metadata fields. Also adds CLAUDE.md documentation covering both tools' architecture, testing, and constraints. Co-Authored-By: Claude Sonnet 4.5 --- tools/bioproject2srr/bioproject2srr/CLAUDE.md | 98 ++++++++ .../bioproject2srr/bioproject2srr/srr2sam.py | 209 ++++++++++++++++++ .../bioproject2srr/bioproject2srr/srr2sam.xml | 37 ++++ .../test-data/biosample_metadata.tsv | 4 + .../test-data/srr_test_input.txt | 3 + 5 files changed, 351 insertions(+) create mode 100644 tools/bioproject2srr/bioproject2srr/CLAUDE.md create mode 100644 tools/bioproject2srr/bioproject2srr/srr2sam.py create mode 100644 tools/bioproject2srr/bioproject2srr/srr2sam.xml create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt diff --git a/tools/bioproject2srr/bioproject2srr/CLAUDE.md b/tools/bioproject2srr/bioproject2srr/CLAUDE.md new file mode 100644 index 0000000..49b9f2c --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/CLAUDE.md @@ -0,0 +1,98 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Overview + +This directory contains two complementary Galaxy tools for working with NCBI metadata: + +### bio2srr (BioProject → SRR) +Galaxy tool wrapper for `bio2srr.py`, which retrieves SRR (Sequence Read Archive) accessions and biosample metadata from NCBI BioProjects using the Entrez E-utilities API. The tool recursively follows links to subprojects and biosamples, producing two outputs: +- `accessions.txt`: deduplicated list of SRR accessions +- `metadata.tsv`: full metadata table joining biosample attributes with SRA run information + +### srr2sam (SRR → BioSample) +Galaxy tool wrapper for `srr2sam.py`, which performs the reverse operation: given a list of SRR accessions, retrieves associated BioSample metadata. Produces: +- `output.tsv`: tabular file with SRR accession as first column, followed by all BioSample metadata fields + +Both tools are containerized and run via Docker in Galaxy with the image `docker.io/crashfrog/bp2srr-galaxy:latest`. + +## Key Architecture + +### Core Logic Flow (bio2srr.py) +1. **ID Resolution**: Convert BioProject accession → UID via esearch +2. **Recursive Link Following**: `resolve_bioproject_ids_and_links()` walks BioProject→BioProject and BioProject→BioSample links +3. **XML Parsing**: `flatten_biosample_xml()` extracts structured metadata from BioSample XML +4. **SRA Mapping**: `yield_sra_runs_from_sample()` links BioSamples to SRA runs via elink +5. **Output Generation**: Produces sorted, deduplicated TSV with custom header ordering + +### E-utilities API Pattern +All NCBI API calls use `requests.get()` with: +- Rate limiting: 1s delay (no API key) or 0.1s (with `NCBI_API_KEY`) +- Batching: process IDs in groups of 200 for esummary calls +- Error handling: `raise_for_status()` on all responses + +### Core Logic Flow (srr2sam.py) +1. **SRR → SRA UID**: Convert SRR accession → SRA UID via esearch +2. **SRA → BioSample Link**: Use elink to traverse from SRA to BioSample database +3. **BioSample Metadata Fetch**: Retrieve BioSample XML via esummary +4. **XML Parsing**: Reuse `flatten_biosample_xml()` from bio2srr.py +5. **Output Generation**: TSV with SRR accession as first column + +### Header Ordering +Both tools use `header_sort_override()` to enforce priority fields at the left: +- **bio2srr**: `bioproject, srr_accession, biosample_accession, organism, taxid, package` then alphabetical +- **srr2sam**: `sra_run_accession, biosample_accession, organism, taxid, package` then alphabetical + +## Testing + +**Run unit tests:** +```bash +pytest tests.py -v +``` + +**Run Galaxy tool tests:** +Both tools include functional tests that are run by Galaxy's Planemo test framework. Test data is in `test-data/`. + +**bio2srr.xml:** +- Input: `PRJNA681235` (BioProject accession) +- Expected outputs: `accessions.txt`, `metadata.tsv` +- Negative test: `NOTHING` (expects failure) + +**srr2sam.xml:** +- Input: `srr_test_input.txt` (3 SRR accessions) +- Expected output: `biosample_metadata.tsv` + +To run Galaxy tool tests locally: +```bash +planemo test --galaxy_python_version 3.11 bio2srr.xml +planemo test --galaxy_python_version 3.11 srr2sam.xml +``` + +## Tool Shed Publishing + +This tool is published to the Galaxy ToolShed with metadata in `.shed.yml`: +- Owner: jpayne +- Category: Tools +- Name: bioproject2srr + +## Dependencies + +The Python script uses: +- `requests` for HTTP calls to NCBI E-utilities +- `xml.etree.ElementTree` for parsing BioSample and SRA run XML +- `itertools.batched` (or backport for Python <3.12) +- Environment variable `NCBI_API_KEY` (optional, increases rate limit) + +## Important Constraints + +**bio2srr:** +- Handles unreleased samples (samples without SRA runs) by including them in metadata but warning about omission from accessions list +- Duplicate SRA runs (reached via multiple projects/samples) are deduplicated in `accessions.txt` but retained in metadata +- Non-public SRA runs are skipped with a warning +- API responses contain "deliriously" embedded XML fragments that require wrapping in `` root tags before parsing + +**srr2sam:** +- If an SRR accession cannot be resolved to a BioSample, the row will contain "NOT_FOUND" in the biosample_accession column +- If BioSample XML parsing fails, the row will contain "PARSE_ERROR" +- Takes the first BioSample UID when an SRA record links to multiple BioSamples (rare edge case) diff --git a/tools/bioproject2srr/bioproject2srr/srr2sam.py b/tools/bioproject2srr/bioproject2srr/srr2sam.py new file mode 100644 index 0000000..85ea442 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/srr2sam.py @@ -0,0 +1,209 @@ +"Retrieve BioSample metadata from SRR accessions via Eutils" + +import requests +import sys +import csv +import os + +try: + from itertools import batched +except ImportError: + from itertools import islice + def batched(iterable, n): + "Batch data into tuples of length n. The last batch may be shorter." + if n < 1: + raise ValueError('n must be at least one') + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch + +from functools import cmp_to_key +from time import sleep +from xml.etree import ElementTree as xml + +esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" +esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" +elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" + +import logging +logging.basicConfig(level=logging.INFO) + +logger = logging.getLogger("srr2sam") + +extra_params = {} + +api_key = os.environ.get("NCBI_API_KEY") + +if api_key: + logger.info(f"Using NCBI API key {api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}") + extra_params["api_key"] = api_key + +def log(msg): + if api_key: + logger.info(msg.replace(api_key, f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")) + else: + logger.info(msg) + +def get_tag(root, tag): + val = root.find(tag) + if val is not None: + return val.text + log(f"No result for {tag}") + +def header_sort_override(a, b): + if a == b: + return 0 + try: + for name in ["sra_run_accession", "biosample_accession", "organism", "taxid", "package"]: + if a == name: + return -1 + if b == name: + return 1 + except: + pass + if a < b: + return -1 + else: + return 1 + +hso = cmp_to_key(header_sort_override) + +def flatten_biosample_xml(biosampxml): + "Parse BioSample XML and return dict of attributes" + root = xml.fromstring(biosampxml) + accession = get_tag(root, r'.//Id[@db="BioSample"]') + organism = get_tag(root, r".//OrganismName") + tax_id = root.find(r".//Organism").attrib.get("taxonomy_id") + package = get_tag(root, r".//Package") + sampledict = dict( + biosample_accession=accession, + organism=organism, + taxid=tax_id, + package=package + ) + for attribute in root.findall("Attributes/Attribute"): + sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text + + return sampledict + +def srr_to_biosample_uid(srr_accession): + "Convert SRR accession to BioSample UID via Entrez" + sleep(1 if not api_key else 0.1) + + # First convert SRR accession to SRA UID + response = requests.get(esearch, params=dict(db="sra", term=srr_accession, format="json", **extra_params)) + response.raise_for_status() + reply = response.json() + + try: + sra_uid = reply["esearchresult"]["idlist"][0] + log(f"Found SRA UID {sra_uid} for '{srr_accession}'") + except IndexError: + logger.warning(f"No SRA UID found for '{srr_accession}'") + return None + + sleep(1 if not api_key else 0.1) + + # Link from SRA to BioSample + response = requests.get(elink, params=dict(id=sra_uid, dbfrom="sra", db="biosample", format="json", **extra_params)) + response.raise_for_status() + reply = response.json() + + biosample_uids = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []) + + if not biosample_uids: + logger.warning(f"No BioSample link found for SRR {srr_accession} (UID {sra_uid})") + return None + + return biosample_uids[0] # Return first BioSample UID + +def get_biosample_metadata(biosample_uid): + "Fetch BioSample metadata XML via esummary" + sleep(1 if not api_key else 0.1) + + response = requests.get(esummary, params=dict(id=biosample_uid, db="biosample", format="json", **extra_params)) + response.raise_for_status() + reply = response.json() + + biosample_xml = reply.get("result", {}).get(biosample_uid, {}).get("sampledata") + + if not biosample_xml: + logger.warning(f"No metadata found for BioSample UID {biosample_uid}") + return None + + return biosample_xml + +def main(input_file): + rows = [] + + # Read SRR accessions from input file + with open(input_file, 'r') as f: + srr_accessions = [line.strip() for line in f if line.strip()] + + log(f"Processing {len(srr_accessions)} SRR accessions") + + for i, srr in enumerate(srr_accessions, 1): + log(f"Processing {srr} ({i}/{len(srr_accessions)})") + + # Get BioSample UID + biosample_uid = srr_to_biosample_uid(srr) + if not biosample_uid: + # Create minimal row for failed lookups + rows.append({"sra_run_accession": srr, "biosample_accession": "NOT_FOUND"}) + continue + + # Get BioSample metadata + biosample_xml = get_biosample_metadata(biosample_uid) + if not biosample_xml: + rows.append({"sra_run_accession": srr, "biosample_accession": "NOT_FOUND"}) + continue + + # Parse metadata + try: + metadata = flatten_biosample_xml(biosample_xml) + metadata["sra_run_accession"] = srr + rows.append(metadata) + except Exception as e: + logger.error(f"Error parsing BioSample metadata for {srr}: {e}") + rows.append({"sra_run_accession": srr, "biosample_accession": "PARSE_ERROR"}) + + log(f"Writing {len(rows)} rows to output.tsv") + + # Collect all headers + header = set() + for row in rows: + for key in row.keys(): + header.add(key) + + header = sorted(list(header), key=hso) + + # Sort by SRR accession + rows.sort(key=lambda x: x.get("sra_run_accession", "")) + + # Write output + with open("output.tsv", "w") as f: + writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel") + writer.writeheader() + writer.writerows(rows) + + log("Complete") + +if __name__ == "__main__": + if len(sys.argv) < 2: + logger.error("Usage: python srr2sam.py ") + sys.exit(1) + + input_file = sys.argv[1] + + if not os.path.exists(input_file): + logger.error(f"Input file not found: {input_file}") + sys.exit(1) + + try: + main(input_file) + except requests.HTTPError as e: + logger.error(f"HTTP error: {e}") + sys.exit(1) + except Exception as e: + logger.error(f"Error: {e}") + sys.exit(1) diff --git a/tools/bioproject2srr/bioproject2srr/srr2sam.xml b/tools/bioproject2srr/bioproject2srr/srr2sam.xml new file mode 100644 index 0000000..b780e26 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/srr2sam.xml @@ -0,0 +1,37 @@ + + Retrieve BioSample metadata from SRR accessions. Traverses NCBI Entrez from SRA to BioSample. + + docker.io/crashfrog/bp2srr-galaxy:latest + + + + + + + + + + + + + + + + + + diff --git a/tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv b/tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv new file mode 100644 index 0000000..a7f1a79 --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv @@ -0,0 +1,4 @@ +sra_run_accession biosample_accession organism taxid package Genus ProjectAccession PublicAccession Species attribute_package collected_by collection_date geo_loc_name isolate_name_alias isolation_source lat_lon project_name sequenced_by strain +SRR13160357 SAMN16946945 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091029 coli environmental/food/other CDC 2019 USA CFSAN091029 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition ECP19-2498 +SRR13160358 SAMN16946945 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091029 coli environmental/food/other CDC 2019 USA CFSAN091029 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition ECP19-2498 +SRR13160359 SAMN16946947 Escherichia coli O157:H7 83334 Pathogen.env.1.0 Escherichia PRJNA681235 CFSAN091028 coli environmental/food/other CDC 2019 USA CFSAN091028 missing missing GenomeTrakr FDA Center for Food Safety and Applied Nutrition ECP19-798 diff --git a/tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt b/tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt new file mode 100644 index 0000000..aab78da --- /dev/null +++ b/tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt @@ -0,0 +1,3 @@ +SRR13160357 +SRR13160358 +SRR13160359