galaxyproject · nekrut · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/tools/collapse_collection/.shed.yml b/tools/collapse_collection/.shed.yml
@@ -0,0 +1,10 @@
+categories: [Text Manipulation]
+description: Collapse a list collection into a single dataset, with options to keep a common header and prepend dataset names.
+long_description: |
+  Concatenates every file in a list collection into a single output dataset,
+  preserving collection order. Supports header deduplication and dataset name
+  prepending in several placement modes.
+name: collapse_collections
+owner: nml  # Tool Shed owner account
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/collapse_collection
+homepage_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/collapse_collection
diff --git a/tools/collapse_collection/collapse_collection.xml b/tools/collapse_collection/collapse_collection.xml
@@ -0,0 +1,206 @@
+<tool id="collapse_dataset" name="Collapse Collection" version="5.2.0" profile="24.2">
+  <description>into single dataset in order of the collection</description>
+  <!-- macros.xml supplies the @DIAGRAM@ token that is expanded inside the help section. -->
+  <macros><import>macros.xml</import></macros>
+  <!-- The command template calls awk, which the gawk package provides. -->
+  <requirements>
+    <requirement type="package" version="5.1.0">gawk</requirement>
+  </requirements>
+  <command>
+    <![CDATA[
+    ## All inputs are concatenated in collection order inside one subshell so a single redirect captures the output.
+    ## Element identifiers are user supplied: they are shell-quoted and handed to printf/awk as data, never as a
+    ## printf format string or as awk program text, so characters such as % \ " $ or a backtick stay literal.
+    #import shlex
+    (
+    #if $one_header:
+      #if str($filename.add_name) == "true":
+        awk '{if (NR==1) {print "Sample\t"$0}}' "$input_list[0]";
+      #else:
+        awk '{if (NR==1) {print}}' "$input_list[0]";
+      #end if
+    #end if
+    #for $f in $input_list#
+    #set $element_name = shlex.quote(str($f.element_identifier))
+    #if str($filename.add_name) == "true":
+       #if str($filename.place_name) ==  "same_once":
+         #if $one_header:
+           printf '%s\t' ${element_name}; tail -q -n +2 "$f";
+         #else:
+           printf '%s\t' ${element_name}; awk '{ print $0 } END { if (NR == 0 && NF == 0) { print "" } }' "$f";
+         #end if
+       #elif str($filename.place_name) ==  "same_multiple":
+         #if $one_header:
+           ELEMENT_NAME=${element_name} awk '{if (NR!=1) {print ENVIRON["ELEMENT_NAME"] "\t" $0}}' "$f";
+         #else:
+           ELEMENT_NAME=${element_name} awk '{print ENVIRON["ELEMENT_NAME"] "\t" $0} END { if (NR == 0 && NF == 0) { print ENVIRON["ELEMENT_NAME"] "\t" } }' "$f";
+         #end if
+       #elif str($filename.place_name) ==  "above":
+         #if $one_header:
+           printf '%s\n' ${element_name}; tail -q -n +2  "$f";
+         #else:
+           printf '%s\n' ${element_name}; cat "$f";
+         #end if
+       #end if
+    #else:
+       #if $one_header:
+         awk '{if (NR!=1) {print}}' "$f";
+       #else:
+         cat "$f" ;
+       #end if
+    #end if
+    #end for#
+    )
+    > '$output'
+    ]]>
+  </command>
+  <inputs>
+    <param name="input_list" type="data" format="data" multiple="true" optional="false" label="Collection of files to collapse into single dataset" help="Select a list collection whose elements will be concatenated in order."/>
+    <param name="one_header" type="boolean" label="Keep one header line" help="Use the first line of the first file as a single header for the output. Enable this when every file shares the same header row."/>
+    <!-- The placement selector is only offered once name prepending is switched on. -->
+    <conditional name="filename">
+      <param name="add_name" type="select" label="Prepend dataset name" help="Add the element identifier of each dataset as a label or column in the output.">
+        <option value="false" selected="true">No</option>
+        <option value="true">Yes</option>
+      </param>
+      <when value="false"/>
+      <when value="true">
+        <param name="place_name" type="select" label="Where to add dataset name" help="Controls how the element identifier is inserted into the output.">
+          <option value="same_once">As a new column, first row only</option>
+          <option value="same_multiple">As a new column, every row</option>
+          <option value="above">On a separate line above each dataset</option>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs><!-- One merged dataset; format_source makes its datatype follow the input_list datasets. -->
+    <data name="output" format_source="input_list"/>
+  </outputs>
+  <tests>
+    <test><!-- Default options: plain concatenation of two text files. -->
+      <param name="input_list" value="input1,input2"/>
+      <output name="output" file="answer.txt"/>
+    </test>
+    <test><!-- Single header kept, dataset name prepended as a column on every row. -->
+      <param name="input_list" value="strain1.tsv,strain2.tsv"/>
+      <param name="one_header" value="True"/>
+      <param name="filename|add_name" value="true"/>
+      <param name="filename|place_name" value="same_multiple"/>
+      <output name="output" file="answer2.tsv"/>
+    </test>
+    <test><!-- Single header kept, no dataset names added. -->
+      <param name="input_list" value="strain1.tsv,strain2.tsv"/>
+      <param name="one_header" value="True"/>
+      <output name="output" file="answer3.tsv"/>
+    </test>
+
+  </tests>
+  <help><![CDATA[
+
+===========
+Description
+===========
+
+Concatenates every file in a list collection into a single output dataset, preserving the order in which they appear in the collection.
+
+When the files share a common header line (e.g. a TSV with column names), enable **Keep one header line** to emit the header only once at the top of the output instead of repeating it for every file. You can also prepend the element identifier (dataset name) to each record so that the source of every line is traceable in the merged output.
+
+The diagram below shows an example with header merging and dataset names prepended as a column on every row:
+
+@DIAGRAM@
+
+========
+Examples
+========
+
+**Basic collapse — simple concatenation**
+
+Two text files in a collection::
+
+ file1:
+   first file
+   second
+   third
+   fourth line
+
+ file2:
+   second file
+   second
+   third
+   fourth line
+
+Output::
+
+ first file
+ second
+ third
+ fourth line
+ second file
+ second
+ third
+ fourth line
+
+-------
+
+**Header merging with dataset name on every row**
+
+Two TSV files with a shared header, **Keep one header line** enabled, **Prepend dataset name** set to *As a new column, every row*::
+
+ strain1.tsv:
+   seq_name  median  mean  ...
+   mcr_1     52      52.74 ...
+   mcr_2     0       1.61  ...
+
+ strain2.tsv:
+   seq_name  median  mean  ...
+   mcr_1     85      85.62 ...
+   mcr_2     0       3.05  ...
+
+Output::
+
+ Sample       seq_name  median  mean  ...
+ strain1.tsv  mcr_1     52      52.74 ...
+ strain1.tsv  mcr_2     0       1.61  ...
+ strain2.tsv  mcr_1     85      85.62 ...
+ strain2.tsv  mcr_2     0       3.05  ...
+
+-------
+
+**Dataset name on a separate line above**
+
+Same two TSV files, **Keep one header line** enabled, **Prepend dataset name** set to *On a separate line above each dataset* (the kept header still gains a leading ``Sample`` column)::
+
+ Sample  seq_name  median  mean  ...
+ strain1.tsv
+ mcr_1     52      52.74 ...
+ mcr_2     0       1.61  ...
+ strain2.tsv
+ mcr_1     85      85.62 ...
+ mcr_2     0       3.05  ...
+
+-------
+
+**Header merging only (no dataset names)**
+
+Same two TSV files, **Keep one header line** enabled, **Prepend dataset name** disabled::
+
+ seq_name  median  mean  ...
+ mcr_1     52      52.74 ...
+ mcr_2     0       1.61  ...
+ mcr_1     85      85.62 ...
+ mcr_2     0       3.05  ...
+
+-------
+
+Originally developed by `Philip Mabon (Takadonet) <https://github.com/Takadonet>`_ at the National Microbiology Laboratory (PHAC).
+
+    ]]></help>
+  <citations><!-- Credits the PHAC NML repository that the bibtex note names as the original source. -->
+    <citation type="bibtex">@misc{phac_nml_galaxy_tools,
+      title={Galaxy Tools},
+      author={{Public Health Agency of Canada, National Microbiology Laboratory}},
+      url={https://github.com/phac-nml/galaxy_tools},
+      note={Original source repository}
+    }</citation>
+  </citations>
+</tool>
diff --git a/tools/collapse_collection/macros.xml b/tools/collapse_collection/macros.xml
@@ -0,0 +1,7 @@
+<macros><!-- @DIAGRAM@ expands to the reST image directive referenced from the tool help. -->
+    <token name="@DIAGRAM@"><![CDATA[
+.. image:: $PATH_TO_IMAGES/collapse.png
+  :alt: Collapse a list collection into a single dataset
+  :width: 700
+    ]]></token>
+</macros>
diff --git a/tools/collapse_collection/static/images/collapse.png b/tools/collapse_collection/static/images/collapse.png
diff --git a/tools/collapse_collection/static/images/collapse.svg b/tools/collapse_collection/static/images/collapse.svg
diff --git a/tools/collapse_collection/test-data/answer.txt b/tools/collapse_collection/test-data/answer.txt
@@ -0,0 +1,8 @@
+first file
+second
+third
+fourth line
+second file
+second
+third
+fourth line
diff --git a/tools/collapse_collection/test-data/answer2.tsv b/tools/collapse_collection/test-data/answer2.tsv
@@ -0,0 +1,5 @@
+Sample	seq_name	median	mean	gc%	seq_length	invalid_bases	%_invalid	non_zero_bases	%_non_zero	%_non_zero_corrected
+strain1.tsv	mcr_1	52	52.74000	0.49139	1626	0	0.00000	1600	100.00000	100.00000
+strain1.tsv	mcr_2 	0	1.60905	0.48114	1617	0	0.00000	56	3.51980	3.51980
+strain2.tsv	mcr_1	85	85.61500	0.49139	1626	0	0.00000	1600	100.00000	100.00000
+strain2.tsv	mcr_2 	0	3.05343	0.48114	1617	0	0.00000	66	4.14833	4.14833
diff --git a/tools/collapse_collection/test-data/answer3.tsv b/tools/collapse_collection/test-data/answer3.tsv
@@ -0,0 +1,5 @@
+seq_name	median	mean	gc%	seq_length	invalid_bases	%_invalid	non_zero_bases	%_non_zero	%_non_zero_corrected
+mcr_1	52	52.74000	0.49139	1626	0	0.00000	1600	100.00000	100.00000
+mcr_2 	0	1.60905	0.48114	1617	0	0.00000	56	3.51980	3.51980
+mcr_1	85	85.61500	0.49139	1626	0	0.00000	1600	100.00000	100.00000
+mcr_2 	0	3.05343	0.48114	1617	0	0.00000	66	4.14833	4.14833
diff --git a/tools/collapse_collection/test-data/input1 b/tools/collapse_collection/test-data/input1
@@ -0,0 +1,4 @@
+first file
+second
+third
+fourth line
diff --git a/tools/collapse_collection/test-data/input2 b/tools/collapse_collection/test-data/input2
@@ -0,0 +1,4 @@
+second file
+second
+third
+fourth line
diff --git a/tools/collapse_collection/test-data/strain1.tsv b/tools/collapse_collection/test-data/strain1.tsv
@@ -0,0 +1,3 @@
+seq_name	median	mean	gc%	seq_length	invalid_bases	%_invalid	non_zero_bases	%_non_zero	%_non_zero_corrected
+mcr_1	52	52.74000	0.49139	1626	0	0.00000	1600	100.00000	100.00000
+mcr_2 	0	1.60905	0.48114	1617	0	0.00000	56	3.51980	3.51980
diff --git a/tools/collapse_collection/test-data/strain2.tsv b/tools/collapse_collection/test-data/strain2.tsv
@@ -0,0 +1,3 @@
+seq_name	median	mean	gc%	seq_length	invalid_bases	%_invalid	non_zero_bases	%_non_zero	%_non_zero_corrected
+mcr_1	85	85.61500	0.49139	1626	0	0.00000	1600	100.00000	100.00000
+mcr_2 	0	3.05343	0.48114	1617	0	0.00000	66	4.14833	4.14833