From e76f6a9c5483b91883d9abfc6c0147a57e65e726 Mon Sep 17 00:00:00 2001
From: Justin Payne <justin.payne@fda.hhs.gov>
Date: Thu, 21 May 2026 18:17:46 -0500
Subject: [PATCH 1/2] Add bioproject2srr Galaxy tool for retrieving SRR
 accessions from BioProjects

Retrieves SRR accessions and biosample metadata from NCBI BioProjects using
Entrez E-utilities API. Recursively follows links to subprojects and biosamples.

Outputs:
- accessions.txt: deduplicated list of SRR accessions
- metadata.tsv: full metadata table joining biosample attributes with SRA runs

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 tools/bioproject2srr/bioproject2srr/.shed.yml |   6 +
 .../bioproject2srr/bioproject2srr/bio2srr.py  | 276 ++++++++++++++++++
 .../bioproject2srr/bioproject2srr/bio2srr.xml |  31 ++
 .../bioproject2srr/job_conf.yml               |  34 +++
 .../bioproject2srr/test-data/accessions.txt   |   5 +
 .../bioproject2srr/test-data/metadata.tsv     |   8 +
 tools/bioproject2srr/bioproject2srr/tests.py  |  42 +++
 7 files changed, 402 insertions(+)
 create mode 100644 tools/bioproject2srr/bioproject2srr/.shed.yml
 create mode 100644 tools/bioproject2srr/bioproject2srr/bio2srr.py
 create mode 100644 tools/bioproject2srr/bioproject2srr/bio2srr.xml
 create mode 100644 tools/bioproject2srr/bioproject2srr/job_conf.yml
 create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/accessions.txt
 create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv
 create mode 100644 tools/bioproject2srr/bioproject2srr/tests.py

diff --git a/tools/bioproject2srr/bioproject2srr/.shed.yml b/tools/bioproject2srr/bioproject2srr/.shed.yml
new file mode 100644
index 0000000..59f0183
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/.shed.yml
@@ -0,0 +1,6 @@
+categories: ['Tools']
+description: Give a Bioproject accession, get a list of associated SRR accessions
+name: bioproject2srr
+owner: jpayne
+ignore:
+- CLAUDE.md
diff --git a/tools/bioproject2srr/bioproject2srr/bio2srr.py b/tools/bioproject2srr/bioproject2srr/bio2srr.py
new file mode 100644
index 0000000..09a6456
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/bio2srr.py
@@ -0,0 +1,276 @@
+"Grab SRR numbers from Bioprojects and sub-bioprojects via Eutils"
+
+import requests
+import sys
+import csv
+import os
+
+try:
+    from itertools import batched
+except ImportError:
+    from itertools import islice
+    def batched(iterable, n):
+        "Batch data into tuples of length n. The last batch may be shorter."
+        # batched('ABCDEFG', 3) --> ABC DEF G
+        if n < 1:
+            raise ValueError('n must be at least one')
+        it = iter(iterable)
+        while batch := tuple(islice(it, n)):
+            yield batch
+from functools import cmp_to_key
+from time import sleep
+from xml.etree import ElementTree as xml
+
+esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
+
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger("bio2srr")
+
+extra_params = {}
+
+api_key = os.environ.get("NCBI_API_KEY")
+
+if api_key:
+    logger.info(f"Using NCBI API key {api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")
+    extra_params["api_key"] = api_key
+
+def log(msg):
+    if api_key:
+        logger.info(msg.replace(api_key, f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")) # fix logging later
+    else:
+        logger.info(msg)
+
+def get_tag(root, tag):
+    val = root.find(tag)
+    if val is not None:
+        return val.text
+    log(f"No result for {tag}")
+
+
+
+def header_sort_override(a, b):
+    if a == b:
+        return 0
+    try:
+        for name in ["bioproject", "srr_accession", "biosample_accession", "organism", "taxid", "package",]:
+            if a == name:
+                return -1
+            if b == name:
+                return 1
+    except:
+        pass
+    if a < b:
+        return -1
+    else:
+        return 1
+
+hso = cmp_to_key(header_sort_override)
+
+def resolve_bioproject_ids_and_links(bioproject_id_list):
+    "Recursively follow bioproject and biosample links, yield biosample UID's and biosample XML"
+    for i, (bioproject, bioproject_id) in enumerate(bioproject_id_list):
+        log(f"Processing {bioproject} ({bioproject_id}) {i+1}/{len(bioproject_id_list)}")
+        #get bioproject to bioproject links
+        response = requests.get(elink, params=dict(db="bioproject", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
+        response.raise_for_status()
+        reply = response.json()
+        linksets = reply.get("linksets", [{}])[0].get("linksetdbs", [0,0,{}])
+        if len(linksets) >= 3:
+            for id in linksets[2].get("links", []): #third index is the up to down links
+                response = requests.get(esummary, params=dict(id=id, db="bioproject", format="json"))
+                response.raise_for_status()
+                replyy = response.json()
+                biop = replyy["result"][id]["project_acc"]
+                if id not in bioproject_id_list:
+                    bioproject_id_list.append((biop, id)) # recurse over bioproject links
+        # get bioproject to biosample links
+        response = requests.get(elink, params=dict(db="biosample", dbfrom="bioproject", id=bioproject_id, format="json", **extra_params))
+        response.raise_for_status()
+        reply = response.json()
+        links = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
+        log(f"Found {len(links)} biosample links for {bioproject} ({bioproject_id})")
+        for ids in batched(links, 200):
+            response = requests.get(esummary, params=dict(id=",".join(ids), db="biosample", format="json"))
+            response.raise_for_status()
+            replyy = response.json()
+            for field, value in replyy.get("result", {}).items():
+                if "uids" not in field:
+                    yield bioproject, field, value["sampledata"] # this is XML, deleriously
+                    sleep(1 if not api_key else 0.1)
+
+
+biosample_example = """
+<BioSample access="public" publication_date="2020-12-21T00:00:00.000" last_update="2022-06-23T17:45:35.674" submission_date="2020-12-21T15:08:05.690" id="17131268" accession="SAMN17131268">
+    <Ids>     
+        <Id db="BioSample" is_primary="1">SAMN17131268</Id>
+        <Id db_label="Sample name">CJP19-D996</Id>
+    </Ids>
+    <Description>     
+        <Title>Pathogen: environmental/food/other sample from Campylobacter jejuni</Title>     
+        <Organism taxonomy_id="197" taxonomy_name="Campylobacter jejuni">       
+            <OrganismName>Campylobacter jejuni</OrganismName>
+        </Organism>   
+    </Description>   
+    <Owner>     
+        <Name url="http://www.fda.gov/Food/FoodScienceResearch/WholeGenomeSequencingProgramWGS/default.htm" abbreviation="CFSAN">FDA Center for Food Safety and Applied Nutrition</Name> 
+    </Owner> 
+    <Models>   
+        <Model>Pathogen.env</Model>  
+    </Models>  
+    <Package display_name="Pathogen: environmental/food/other; version 1.0">Pathogen.env.1.0</Package> 
+    <Attributes>  
+        <Attribute attribute_name="strain" harmonized_name="strain" display_name="strain">CJP19-D996</Attribute>  
+        <Attribute attribute_name="collection_date" harmonized_name="collection_date" display_name="collection date">missing</Attribute>     
+        <Attribute attribute_name="geo_loc_name" harmonized_name="geo_loc_name" display_name="geographic location">missing</Attribute>     
+        <Attribute attribute_name="collected_by" harmonized_name="collected_by" display_name="collected by">CDC</Attribute>     
+        <Attribute attribute_name="lat_lon" harmonized_name="lat_lon" display_name="latitude and longitude">missing</Attribute>     
+        <Attribute attribute_name="isolation_source" harmonized_name="isolation_source" display_name="isolation source">missing</Attribute>     
+        <Attribute attribute_name="isolate" harmonized_name="isolate" display_name="isolate">CFSAN091032</Attribute>     
+        <Attribute attribute_name="project name" harmonized_name="project_name" display_name="project name">GenomeTrakr</Attribute>     
+        <Attribute attribute_name="sequenced by" harmonized_name="sequenced_by" display_name="sequenced by">FDA Center for Food Safety and Applied Nutrition</Attribute>   
+    </Attributes>   
+    <Links>     
+        <Link type="entrez" target="bioproject" label="PRJNA681235">681235</Link>   
+    </Links>
+    <Status status="live" when="2020-12-21T15:08:05.693"/> 
+</BioSample>
+
+"""
+
+def flatten_biosample_xml(biosampxml):
+    root = xml.fromstring(biosampxml)
+    accession = get_tag(root, r'.//Id[@db="BioSample"]')
+    # sample_name = get_tag(root, r'.//Id[@db_label="Sample name"]')
+    organism = get_tag(root, r".//OrganismName")
+    tax_id = root.find(r".//Organism").attrib.get("taxonomy_id")
+    package = get_tag(root, r".//Package")
+    sampledict = dict(
+        biosample_accession=accession,
+        # sample_name=sample_name,
+        organism = organism,
+        taxid = tax_id,
+        package = package
+    )
+    for attribute in root.findall("Attributes/Attribute"):
+        sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text
+
+    return sampledict
+
+
+def yield_sra_runs_from_sample(biosample):
+    sleep(1 if not api_key else 0.1)
+    response = requests.get(elink, params=dict(id=biosample, dbfrom="biosample", db="sra", format="json", **extra_params))
+    response.raise_for_status()
+    reply = response.json()
+    for ids in batched(reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", []), 200):
+        sleep(1 if not api_key else 0.1)
+        response = requests.get(esummary, params=dict(id=','.join(ids), db="sra", format="json", **extra_params))
+        response.raise_for_status()
+        replyy = response.json()
+        for field, value in replyy.get("result", {}).items():
+            if "uids" not in field:
+                yield field, value.get("runs")
+
+
+runs_example = """
+<Run acc="SRR13167188" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>  
+<Run acc="SRR13167189" total_spots="827691" total_bases="385043067" load_done="true" is_public="true" cluster_name="public" static_data_available="true"/>   
+"""
+
+def flatten_runs(runxml):
+    root = xml.fromstring(f"<data>{runxml}</data>") # gotta fix their garbage embedded XML since it isn't singly-rooted
+    for run in root.findall(".//Run"):
+        if run.attrib["is_public"] == "false":
+            logger.warning(f"Skipping non-public run {run.attrib['acc']}")
+        yield dict(
+            sra_run_accession = run.attrib["acc"],
+            total_spots = run.attrib["total_spots"],
+            total_bases = run.attrib["total_bases"],
+        )
+
+
+
+def main(starting_bioproject):
+    rows = []
+    response = requests.get(esearch, params=dict(db="bioproject", term=starting_bioproject, field="PRJA", format="json"))
+    response.raise_for_status()
+    reply = response.json()
+    try:
+        bioproject_id = reply["esearchresult"]["idlist"][0]
+        log(f"Found UID {bioproject_id} for '{starting_bioproject}'")
+    except IndexError:
+        logger.error(f"No results found for '{starting_bioproject}'. Error was \"{reply['esearchresult']['warninglist']['outputmessages']}\"")
+        sys.exit(1)
+    sleep(1 if not api_key else 0.1)
+    for bioproject, biosample, biosample_xml in resolve_bioproject_ids_and_links([(starting_bioproject, bioproject_id)]):
+        try:
+            sampledict = flatten_biosample_xml(biosample_xml)
+        except KeyError:
+            log(biosample_xml)
+            raise
+        sampledict["bioproject"] = bioproject
+        noruns = True
+        for sra, runs in yield_sra_runs_from_sample(biosample):
+            for run in flatten_runs(runs.strip()):
+                noruns = False
+                run.update(sampledict)
+                rows.append(run)
+        if noruns:
+            rows.append(sampledict)
+
+    log(f"Writing {len(rows)} rows to metadata.tsv")
+
+    header = set()
+    for row in rows:
+        for key in row.keys():
+            header.add(key)
+
+    header = sorted(list(header), key=hso)
+    # logger.info(f"Header: {header}")
+
+    rows.sort(key=lambda x: x["biosample_accession"])
+
+    with open("metadata.tsv", "w") as f:
+        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
+        writer.writeheader()
+        writer.writerows(rows)
+
+    # check for duplicate runs and unreleased samples
+
+    accessions = [row.get("sra_run_accession") for row in rows if row.get("sra_run_accession")]
+
+    raw_length = len(accessions)
+
+    accessions = sorted(list(set(accessions)))
+
+    if raw_length < len(rows):
+        logger.warning(f"Bioproject {starting_bioproject} contains unreleased samples. {len(rows) - raw_length} samples will not be included in accessions.txt")
+
+    if len(accessions) < raw_length:
+        logger.warning(f"Some SRA runs may have been reached through multiple projects or samples. accessions.txt will be deduplicated but the metadata table is not")
+
+    log(f"Writing {len(accessions)} unique accessions to accessions.txt")
+
+    with open("accessions.txt", "w") as f:
+        for accession in accessions:
+            f.write(accession + "\n")
+
+
+if __name__ == "__main__":
+    b = sys.argv[1].strip()
+    log(f"Starting with {b}")
+    try:
+        main(b)
+    except requests.HTTPError as e:
+        logger.error(e)
+        sys.exit(1)
+
+                
+
+
+
diff --git a/tools/bioproject2srr/bioproject2srr/bio2srr.xml b/tools/bioproject2srr/bioproject2srr/bio2srr.xml
new file mode 100644
index 0000000..6f76446
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/bio2srr.xml
@@ -0,0 +1,31 @@
+<tool id="bio2srr" name="Bioproject to SRR" version="3.0.1">
+	<description>Retrieve SRR accessions and sample metadata from BioProject. Recursively follows links to subprojects.</description>
+    <requirements>
+        <container type="docker">docker.io/crashfrog/bp2srr-galaxy:latest</container>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python /bio2srr.py "$input1"
+    ]]></command>
+    <inputs>
+        <param type="text" name="input1" label="BioProject" />
+    </inputs>
+    <outputs>
+    	<data format="txt" name="output" from_work_dir="accessions.txt" label="${input1} Accessions" />
+        <data format="tabular" name="metadata" from_work_dir="metadata.tsv" label="${input1} Metadata" />
+    </outputs>
+    <tests>
+    	<test>
+    		<param name="input1" value="PRJNA681235" />
+    		<output name="output" file="accessions.txt" />
+            <output name="metadata" file="metadata.tsv" />
+    	</test>
+        <test expect_failure="1">
+            <param name="input1" value="NOTHING" />
+        </test>
+    </tests>
+    <help><![CDATA[
+        Retrieve SRR accessions and sample metadata from a BioProject, including subprojects. It's a JOIN, so if a sample is associated with multiple SRA runs, then each run has a copy of the sample's metadata.
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
\ No newline at end of file
diff --git a/tools/bioproject2srr/bioproject2srr/job_conf.yml b/tools/bioproject2srr/bioproject2srr/job_conf.yml
new file mode 100644
index 0000000..529dd53
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/job_conf.yml
@@ -0,0 +1,34 @@
+runners:
+  local:
+    load: galaxy.jobs.runners.local:LocalJobRunner
+    workers: 16
+
+# handling:
+#   processes:
+#     handler0:
+
+execution:
+  default: local
+  environments:
+    local:
+      runner: local
+    docker_local:
+      runner: local
+      docker_enabled: true
+      # container: "auto"
+      docker_volumes: $defaults
+      # docker_set_user: null
+      docker_run_extra_arguments: "--entrypoint ''"
+      docker_set_user: root
+
+tools:
+- id: bio2srr
+  # handler: handler0
+  environment: docker_local
+
+limits:
+-
+  # Amount of time a job can run (in any environment) before it
+  # will be terminated by Galaxy.
+  type: walltime
+  value: '01:00:00'
\ No newline at end of file
diff --git a/tools/bioproject2srr/bioproject2srr/test-data/accessions.txt b/tools/bioproject2srr/bioproject2srr/test-data/accessions.txt
new file mode 100644
index 0000000..94346ce
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/test-data/accessions.txt
@@ -0,0 +1,5 @@
+SRR13160357
+SRR13160358
+SRR13160359
+SRR13160360
+SRR13167188
diff --git a/tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv b/tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv
new file mode 100644
index 0000000..2bb4d4c
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/test-data/metadata.tsv
@@ -0,0 +1,8 @@
+bioproject	biosample_accession	organism	taxid	package	Genus	ProjectAccession	PublicAccession	Species	attribute_package	collected_by	collection_date	geo_loc_name	isolate	isolate_name_alias	isolation_source	lat_lon	project_name	sequenced_by	sra_run_accession	strain	total_bases	total_spots
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR13160357	ECP19-2498	312756765	660858
+PRJNA681235	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA		CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR13160358	ECP19-2498	327001270	704624
+PRJNA681235	SAMN16946946	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091027	coli	environmental/food/other	CDC	2019	USA		CFSAN091027	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR13160360	ECP19-598	316865532	683880
+PRJNA681235	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA		CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR13160359	ECP19-798	473318585	1007158
+PRJNA681235	SAMN16956340	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091030	coli	environmental/food/other	CDC	2019	USA		CFSAN091030	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	SRR13167188	ECP19-198	385043067	827691
+PRJNA681235	SAMN17131267	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091031		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition		CJP19-D445		
+PRJNA681235	SAMN17131268	Campylobacter jejuni	197	Pathogen.env.1.0						CDC	missing	missing	CFSAN091032		missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition		CJP19-D996		
diff --git a/tools/bioproject2srr/bioproject2srr/tests.py b/tools/bioproject2srr/bioproject2srr/tests.py
new file mode 100644
index 0000000..a9a5e18
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/tests.py
@@ -0,0 +1,42 @@
+import pytest
+
+from bio2srr import *
+
+
+def test_element_tree_xpath():
+    from xml.etree import ElementTree as xml
+    root = xml.fromstring(biosample_example)
+    assert root.find(".//Id[@db='BioSample']") is not None
+
+def test_flatten_biosample_xml():
+    d = flatten_biosample_xml(biosample_example)
+    assert d['biosample_accession'] == 'SAMN17131268'
+    assert d['organism'] == 'Campylobacter jejuni'
+    assert d['isolate'] == 'CFSAN091032'
+
+def test_flatten_runs():
+    d = list(flatten_runs(runs_example))
+    assert len(d) == 2
+
+def test_header_sort_override_consistency():
+    import random
+    L = ["C", "B", "A", "taxid", "bioproject"]
+    L.sort(key=hso)
+    # assert L[0] == "bioproject"
+    A = L.copy()
+    assert A == L
+    R = []
+    for _ in range(100):
+        random.shuffle(A)
+        A.sort(key=hso)
+        R.append(A == L)
+    assert all(R)
+
+def test_hso_override():
+    assert header_sort_override("bioproject", "taxid") < 0
+    assert header_sort_override("taxid", "bioproject") > 0
+    assert header_sort_override("taxid", "taxid") == 0
+
+def test_hso_regular():
+    assert header_sort_override("A", "B") < 0
+    assert header_sort_override("B", "A") > 0
\ No newline at end of file

From 4f8e0a9ddd47d9d17469b7dba24b1c41f04e1c26 Mon Sep 17 00:00:00 2001
From: Justin Payne <justin.payne@fda.hhs.gov>
Date: Thu, 21 May 2026 18:18:10 -0500
Subject: [PATCH 2/2] Add srr2sam tool for reverse SRR to BioSample lookup and
 documentation

Adds complementary tool that retrieves BioSample metadata from a list of SRR
accessions. Performs reverse operation of bio2srr by traversing NCBI Entrez
from SRA to BioSample.

Output: tabular file with SRR accession as first column followed by all
BioSample metadata fields.

Also adds CLAUDE.md documentation covering both tools' architecture, testing,
and constraints.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 tools/bioproject2srr/bioproject2srr/CLAUDE.md |  98 ++++++++
 .../bioproject2srr/bioproject2srr/srr2sam.py  | 209 ++++++++++++++++++
 .../bioproject2srr/bioproject2srr/srr2sam.xml |  37 ++++
 .../test-data/biosample_metadata.tsv          |   4 +
 .../test-data/srr_test_input.txt              |   3 +
 5 files changed, 351 insertions(+)
 create mode 100644 tools/bioproject2srr/bioproject2srr/CLAUDE.md
 create mode 100644 tools/bioproject2srr/bioproject2srr/srr2sam.py
 create mode 100644 tools/bioproject2srr/bioproject2srr/srr2sam.xml
 create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv
 create mode 100644 tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt

diff --git a/tools/bioproject2srr/bioproject2srr/CLAUDE.md b/tools/bioproject2srr/bioproject2srr/CLAUDE.md
new file mode 100644
index 0000000..49b9f2c
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/CLAUDE.md
@@ -0,0 +1,98 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Overview
+
+This directory contains two complementary Galaxy tools for working with NCBI metadata:
+
+### bio2srr (BioProject → SRR)
+Galaxy tool wrapper for `bio2srr.py`, which retrieves SRR (Sequence Read Archive) accessions and biosample metadata from NCBI BioProjects using the Entrez E-utilities API. The tool recursively follows links to subprojects and biosamples, producing two outputs:
+- `accessions.txt`: deduplicated list of SRR accessions
+- `metadata.tsv`: full metadata table joining biosample attributes with SRA run information
+
+### srr2sam (SRR → BioSample)
+Galaxy tool wrapper for `srr2sam.py`, which performs the reverse operation: given a list of SRR accessions, retrieves associated BioSample metadata. Produces:
+- `output.tsv`: tabular file with SRR accession as first column, followed by all BioSample metadata fields
+
+Both tools are containerized and run via Docker in Galaxy with the image `docker.io/crashfrog/bp2srr-galaxy:latest`.
+
+## Key Architecture
+
+### Core Logic Flow (bio2srr.py)
+1. **ID Resolution**: Convert BioProject accession → UID via esearch
+2. **Recursive Link Following**: `resolve_bioproject_ids_and_links()` walks BioProject→BioProject and BioProject→BioSample links
+3. **XML Parsing**: `flatten_biosample_xml()` extracts structured metadata from BioSample XML
+4. **SRA Mapping**: `yield_sra_runs_from_sample()` links BioSamples to SRA runs via elink
+5. **Output Generation**: Produces sorted, deduplicated TSV with custom header ordering
+
+### E-utilities API Pattern
+All NCBI API calls use `requests.get()` with:
+- Rate limiting: 1s delay (no API key) or 0.1s (with `NCBI_API_KEY`)
+- Batching: process IDs in groups of 200 for esummary calls
+- Error handling: `raise_for_status()` on all responses
+
+### Core Logic Flow (srr2sam.py)
+1. **SRR → SRA UID**: Convert SRR accession → SRA UID via esearch
+2. **SRA → BioSample Link**: Use elink to traverse from SRA to BioSample database
+3. **BioSample Metadata Fetch**: Retrieve BioSample XML via esummary
+4. **XML Parsing**: Reuse `flatten_biosample_xml()` from bio2srr.py
+5. **Output Generation**: TSV with SRR accession as first column
+
+### Header Ordering
+Both tools use `header_sort_override()` to enforce priority fields at the left:
+- **bio2srr**: `bioproject, srr_accession, biosample_accession, organism, taxid, package` then alphabetical
+- **srr2sam**: `sra_run_accession, biosample_accession, organism, taxid, package` then alphabetical
+
+## Testing
+
+**Run unit tests:**
+```bash
+pytest tests.py -v
+```
+
+**Run Galaxy tool tests:**
+Both tools include functional tests that are run by Galaxy's Planemo test framework. Test data is in `test-data/`.
+
+**bio2srr.xml:**
+- Input: `PRJNA681235` (BioProject accession)
+- Expected outputs: `accessions.txt`, `metadata.tsv`
+- Negative test: `NOTHING` (expects failure)
+
+**srr2sam.xml:**
+- Input: `srr_test_input.txt` (3 SRR accessions)
+- Expected output: `biosample_metadata.tsv`
+
+To run Galaxy tool tests locally:
+```bash
+planemo test --galaxy_python_version 3.11 bio2srr.xml
+planemo test --galaxy_python_version 3.11 srr2sam.xml
+```
+
+## Tool Shed Publishing
+
+This tool is published to the Galaxy ToolShed with metadata in `.shed.yml`:
+- Owner: jpayne
+- Category: Tools
+- Name: bioproject2srr
+
+## Dependencies
+
+The Python script uses:
+- `requests` for HTTP calls to NCBI E-utilities
+- `xml.etree.ElementTree` for parsing BioSample and SRA run XML
+- `itertools.batched` (or backport for Python <3.12)
+- Environment variable `NCBI_API_KEY` (optional, increases rate limit)
+
+## Important Constraints
+
+**bio2srr:**
+- Handles unreleased samples (samples without SRA runs) by including them in metadata but warning about omission from accessions list
+- Duplicate SRA runs (reached via multiple projects/samples) are deduplicated in `accessions.txt` but retained in metadata
+- Non-public SRA runs are skipped with a warning
+- API responses contain "deliriously" embedded XML fragments that require wrapping in `<data>` root tags before parsing
+
+**srr2sam:**
+- If an SRR accession cannot be resolved to a BioSample, the row will contain "NOT_FOUND" in the biosample_accession column
+- If BioSample XML parsing fails, the row will contain "PARSE_ERROR"
+- Takes the first BioSample UID when an SRA record links to multiple BioSamples (rare edge case)
diff --git a/tools/bioproject2srr/bioproject2srr/srr2sam.py b/tools/bioproject2srr/bioproject2srr/srr2sam.py
new file mode 100644
index 0000000..85ea442
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/srr2sam.py
@@ -0,0 +1,209 @@
+"Retrieve BioSample metadata from SRR accessions via Eutils"
+
+import requests
+import sys
+import csv
+import os
+
+try:
+    from itertools import batched
+except ImportError:
+    from itertools import islice
+    def batched(iterable, n):
+        "Batch data into tuples of length n. The last batch may be shorter."
+        if n < 1:
+            raise ValueError('n must be at least one')
+        it = iter(iterable)
+        while batch := tuple(islice(it, n)):
+            yield batch
+
+from functools import cmp_to_key
+from time import sleep
+from xml.etree import ElementTree as xml
+
+esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+esummary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+elink = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger("srr2sam")
+
+extra_params = {}
+
+api_key = os.environ.get("NCBI_API_KEY")
+
+if api_key:
+    logger.info(f"Using NCBI API key {api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}")
+    extra_params["api_key"] = api_key
+
+def log(msg):
+    if api_key:
+        logger.info(msg.replace(api_key, f"{api_key[:4]}{'*' * (len(api_key) - 8)}{api_key[-4:]}"))
+    else:
+        logger.info(msg)
+
+def get_tag(root, tag):
+    val = root.find(tag)
+    if val is not None:
+        return val.text
+    log(f"No result for {tag}")
+
+def header_sort_override(a, b):
+    if a == b:
+        return 0
+    try:
+        for name in ["sra_run_accession", "biosample_accession", "organism", "taxid", "package"]:
+            if a == name:
+                return -1
+            if b == name:
+                return 1
+    except:
+        pass
+    if a < b:
+        return -1
+    else:
+        return 1
+
+hso = cmp_to_key(header_sort_override)
+
+def flatten_biosample_xml(biosampxml):
+    "Parse BioSample XML and return dict of attributes"
+    root = xml.fromstring(biosampxml)
+    accession = get_tag(root, r'.//Id[@db="BioSample"]')
+    organism = get_tag(root, r".//OrganismName")
+    tax_id = root.find(r".//Organism").attrib.get("taxonomy_id")
+    package = get_tag(root, r".//Package")
+    sampledict = dict(
+        biosample_accession=accession,
+        organism=organism,
+        taxid=tax_id,
+        package=package
+    )
+    for attribute in root.findall("Attributes/Attribute"):
+        sampledict[attribute.attrib.get("harmonized_name", attribute.attrib['attribute_name'])] = attribute.text
+
+    return sampledict
+
+def srr_to_biosample_uid(srr_accession):
+    "Convert SRR accession to BioSample UID via Entrez"
+    sleep(1 if not api_key else 0.1)
+
+    # First convert SRR accession to SRA UID
+    response = requests.get(esearch, params=dict(db="sra", term=srr_accession, format="json", **extra_params))
+    response.raise_for_status()
+    reply = response.json()
+
+    try:
+        sra_uid = reply["esearchresult"]["idlist"][0]
+        log(f"Found SRA UID {sra_uid} for '{srr_accession}'")
+    except IndexError:
+        logger.warning(f"No SRA UID found for '{srr_accession}'")
+        return None
+
+    sleep(1 if not api_key else 0.1)
+
+    # Link from SRA to BioSample
+    response = requests.get(elink, params=dict(id=sra_uid, dbfrom="sra", db="biosample", format="json", **extra_params))
+    response.raise_for_status()
+    reply = response.json()
+
+    biosample_uids = reply.get("linksets", [{}])[0].get("linksetdbs", [{}])[0].get("links", [])
+
+    if not biosample_uids:
+        logger.warning(f"No BioSample link found for SRR {srr_accession} (UID {sra_uid})")
+        return None
+
+    return biosample_uids[0]  # Return first BioSample UID
+
+def get_biosample_metadata(biosample_uid):
+    "Fetch BioSample metadata XML via esummary"
+    sleep(1 if not api_key else 0.1)
+
+    response = requests.get(esummary, params=dict(id=biosample_uid, db="biosample", format="json", **extra_params))
+    response.raise_for_status()
+    reply = response.json()
+
+    biosample_xml = reply.get("result", {}).get(biosample_uid, {}).get("sampledata")
+
+    if not biosample_xml:
+        logger.warning(f"No metadata found for BioSample UID {biosample_uid}")
+        return None
+
+    return biosample_xml
+
+def main(input_file):
+    rows = []
+
+    # Read SRR accessions from input file
+    with open(input_file, 'r') as f:
+        srr_accessions = [line.strip() for line in f if line.strip()]
+
+    log(f"Processing {len(srr_accessions)} SRR accessions")
+
+    for i, srr in enumerate(srr_accessions, 1):
+        log(f"Processing {srr} ({i}/{len(srr_accessions)})")
+
+        # Get BioSample UID
+        biosample_uid = srr_to_biosample_uid(srr)
+        if not biosample_uid:
+            # Create minimal row for failed lookups
+            rows.append({"sra_run_accession": srr, "biosample_accession": "NOT_FOUND"})
+            continue
+
+        # Get BioSample metadata
+        biosample_xml = get_biosample_metadata(biosample_uid)
+        if not biosample_xml:
+            rows.append({"sra_run_accession": srr, "biosample_accession": "NOT_FOUND"})
+            continue
+
+        # Parse metadata
+        try:
+            metadata = flatten_biosample_xml(biosample_xml)
+            metadata["sra_run_accession"] = srr
+            rows.append(metadata)
+        except Exception as e:
+            logger.error(f"Error parsing BioSample metadata for {srr}: {e}")
+            rows.append({"sra_run_accession": srr, "biosample_accession": "PARSE_ERROR"})
+
+    log(f"Writing {len(rows)} rows to output.tsv")
+
+    # Collect all headers
+    header = set()
+    for row in rows:
+        for key in row.keys():
+            header.add(key)
+
+    header = sorted(list(header), key=hso)
+
+    # Sort by SRR accession
+    rows.sort(key=lambda x: x.get("sra_run_accession", ""))
+
+    # Write output
+    with open("output.tsv", "w") as f:
+        writer = csv.DictWriter(f, fieldnames=header, delimiter="\t", dialect="excel")
+        writer.writeheader()
+        writer.writerows(rows)
+
+    log("Complete")
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        logger.error("Usage: python srr2sam.py <input_file>")
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+
+    if not os.path.exists(input_file):
+        logger.error(f"Input file not found: {input_file}")
+        sys.exit(1)
+
+    try:
+        main(input_file)
+    except requests.HTTPError as e:
+        logger.error(f"HTTP error: {e}")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        sys.exit(1)
diff --git a/tools/bioproject2srr/bioproject2srr/srr2sam.xml b/tools/bioproject2srr/bioproject2srr/srr2sam.xml
new file mode 100644
index 0000000..b780e26
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/srr2sam.xml
@@ -0,0 +1,37 @@
+<tool id="srr2sam" name="SRR to BioSample" version="1.0.0">
+	<description>Retrieve BioSample metadata from SRR accessions. Traverses NCBI Entrez from SRA to BioSample.</description>
+    <requirements>
+        <container type="docker">docker.io/crashfrog/bp2srr-galaxy:latest</container>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python '$__tool_directory__/srr2sam.py' '$input1'
+    ]]></command>
+    <inputs>
+        <param type="data" format="txt" name="input1" label="SRR Accessions File" help="Text file with one SRR accession per line" />
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" from_work_dir="output.tsv" label="BioSample Metadata from ${input1.name}" />
+    </outputs>
+    <tests>
+    	<test>
+    		<param name="input1" value="accessions.txt" />
+    		<output name="output" file="biosample_metadata.tsv" />
+    	</test>
+    </tests>
+    <help><![CDATA[
+        Retrieve BioSample metadata from a list of SRR accessions. For each SRR accession in the input file, the tool:
+
+        1. Queries NCBI SRA database to find the SRA UID
+        2. Links from SRA to BioSample using Entrez elink
+        3. Retrieves BioSample metadata including organism, taxonomic ID, sample attributes, and package type
+        4. Outputs a tabular file with SRR accession as the first column, followed by all BioSample metadata fields
+
+        **Input:** Text file with one SRR accession per line (e.g., output from Bioproject to SRR tool)
+
+        **Output:** Tabular file with BioSample metadata. Each row corresponds to one input SRR accession.
+
+        **Note:** If a BioSample cannot be found for an SRR accession, the row will contain "NOT_FOUND" in the biosample_accession column.
+    ]]></help>
+    <citations>
+    </citations>
+</tool>
diff --git a/tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv b/tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv
new file mode 100644
index 0000000..a7f1a79
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/test-data/biosample_metadata.tsv
@@ -0,0 +1,4 @@
+sra_run_accession	biosample_accession	organism	taxid	package	Genus	ProjectAccession	PublicAccession	Species	attribute_package	collected_by	collection_date	geo_loc_name	isolate_name_alias	isolation_source	lat_lon	project_name	sequenced_by	strain
+SRR13160357	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA	CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	ECP19-2498
+SRR13160358	SAMN16946945	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091029	coli	environmental/food/other	CDC	2019	USA	CFSAN091029	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	ECP19-2498
+SRR13160359	SAMN16946947	Escherichia coli O157:H7	83334	Pathogen.env.1.0	Escherichia	PRJNA681235	CFSAN091028	coli	environmental/food/other	CDC	2019	USA	CFSAN091028	missing	missing	GenomeTrakr	FDA Center for Food Safety and Applied Nutrition	ECP19-798
diff --git a/tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt b/tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt
new file mode 100644
index 0000000..aab78da
--- /dev/null
+++ b/tools/bioproject2srr/bioproject2srr/test-data/srr_test_input.txt
@@ -0,0 +1,3 @@
+SRR13160357
+SRR13160358
+SRR13160359