Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 50 additions & 49 deletions tools/collection_column_join/collection_column_join.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
<tool id="collection_column_join" name="Column join" version="0.0.3">
<tool id="collection_column_join" name="Column join" version="0.0.4" profile="24.2">
<description>on multiple datasets</description>
<macros>
<import>macros.xml</import>
</macros>
<requirements>
<requirement type="package" version="8.25">coreutils</requirement>
</requirements>
Expand Down Expand Up @@ -54,12 +57,11 @@ cat header${ ( $i + 1 ) % 2 }.tmp output${ ( $i + 1 ) % 2 }.tmp > "${tabular_out
</configfile>
</configfiles>
<inputs>
<param name="input_tabular" type="data" format="tabular" multiple="True" optional="False" label="Tabular files"/>
<!-- <param name="identifier_column" type="data_column" data_ref="input_tabular" value="0" min="0" optional="False" label="Identifier column"/> -->
<param name="identifier_column" type="integer" value="1" min="0" optional="False" label="Identifier column" help="The column that will be used to join the input datasets"/>
<param name="has_header" type="integer" value="0" min="0" optional="False" label="Number of header lines in each input file" help="If this is set to 0, a header line will be added containing column names as follows: the identifier column will be named #KEY and the other columns are named by the input dataset names/columns. If you have one or more header lines in your input, set this to the number of header lines."/>
<param name="old_col_in_header" type="boolean" checked="true" label="Add column name to header" help="Disable if you want column headers to only be composed of the input file names, for example, if you want headers like file1 and not file1_column1, see Help section below. Default: Yes"/>
<param name="fill_char" type="text" value="." optional="False" label="Fill character" help="a placeholder for empty cells"/>
<param name="input_tabular" type="data" format="tabular" multiple="True" optional="False" label="Tabular files" help="Select multiple tabular datasets from a collection to join on a shared key column."/>
Comment thread
nekrut marked this conversation as resolved.
Outdated
<!-- Columns are numbered from 1 (the default value and the tests use 1 for the first column), so 0 is not a valid choice. -->
<param name="identifier_column" type="integer" value="1" min="1" optional="False" label="Identifier column" help="Column number (1-based) containing the key used to match rows across datasets."/>
<param name="has_header" type="integer" value="0" min="0" optional="False" label="Number of header lines in each input file" help="Set to 0 to auto-generate headers from dataset names. Set to 1 or more if your files already have header lines."/>
<param name="old_col_in_header" type="boolean" checked="true" label="Add column name to header" help="When enabled, output headers are 'filename_column'. When disabled, headers are just 'filename'."/>
<param name="fill_char" type="text" value="." optional="False" label="Fill character" help="Placeholder for cells where a key exists in one file but not another."/>
<param name="include_outputs" type="select" multiple="True" label="Additional datasets to create">
<option value="output_shell_script" selected="false">Shell script</option>
</param>
Expand All @@ -71,93 +73,92 @@ cat header${ ( $i + 1 ) % 2 }.tmp output${ ( $i + 1 ) % 2 }.tmp > "${tabular_out
</data>
</outputs>
<tests>
<test>
<test expect_num_outputs="1">
<param name="input_tabular" value="in_1.tabular,in_2.tabular,in_3.tabular" ftype="tabular"/>
<param name="identifier_column" value="1"/>
<param name="has_header" value="1"/>
<param name="old_col_in_header" value="true"/>
<param name="fill_char" value="."/>
<param name="include_outputs" />

<output name="tabular_output" file="out_1.tabular" ftype="tabular"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="input_tabular" value="in_1_headerless.tabular,in_2_headerless.tabular,in_3_headerless.tabular" ftype="tabular"/>
<param name="identifier_column" value="1"/>
<param name="has_header" value="0"/>
<param name="old_col_in_header" value="true"/>
<param name="fill_char" value="."/>
<param name="include_outputs" />

<output name="tabular_output" file="out_2.tabular" ftype="tabular"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="input_tabular" value="in_1.tabular,in_2.tabular,in_3.tabular" ftype="tabular"/>
<param name="identifier_column" value="1"/>
<param name="has_header" value="1"/>
<param name="old_col_in_header" value="false"/>
<param name="fill_char" value="."/>
<param name="include_outputs" />

<output name="tabular_output" file="out_3.tabular" ftype="tabular"/>
</test>
<test>
<test expect_num_outputs="1">
<param name="input_tabular" value="in_1_headerless.tabular,in_2_headerless.tabular,in_3_headerless.tabular" ftype="tabular"/>
<param name="identifier_column" value="1"/>
<param name="has_header" value="0"/>
<param name="old_col_in_header" value="false"/>
<param name="fill_char" value="."/>
<param name="include_outputs" />

<output name="tabular_output" file="out_4.tabular" ftype="tabular"/>
</test>
</tests>
<help>
<![CDATA[
Joins lists of tabular datasets together on a field.

-----
<help><![CDATA[

**Example**
===========
Description
===========

To join three files, with headers, based on the first column:
Joins multiple tabular datasets side by side on a shared key column. Rows are matched by the value in the identifier column, and all remaining columns are combined into a single wide output table. Rows are sorted alphabetically by the key.

**First file (in_1)**::
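When files have headers, the output header is built from dataset names and column names (e.g. ``in_1_c2``). If **Add column name to header** is disabled, headers use only the dataset name (e.g. ``in_1``). When the files have no header lines (**Number of header lines in each input file** set to 0), a header line is generated automatically: the identifier column is named ``#KEY`` and the remaining columns are named after the input datasets and their columns.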

#KEY c2 c3 c4
one 1-1 1-2 1-3
two 1-4 1-5 1-6
three 1-7 1-8 1-9
If a key appears in some files but not others, the missing values are filled with the **Fill character** (default: ``.``).

@DIAGRAM@

**Second File (in_2)**::
========
Examples
========

#KEY c2 c3 c4
one 2-1 2-2 2-3
two 2-4 2-5 2-6
three 2-7 2-8 2-9
**Join with headers and column names**

**Third file (in_3)**::
Three files with partially overlapping keys, **identifier column** = 1, **header lines** = 1, **Add column name to header** enabled::

#KEY c2 c3 c4
one 3-3 3-2 3-3
two 3-4 3-5 3-6
three 3-7 3-8 3-9
in_1: in_2: in_3:
#KEY c2 c3 #KEY c2 c3 #KEY c2 c3
A 🍎 🍊 A 🐱 🐶 A ⭐ 🌙
B 🍇 🍓 B 🐸 🦊 C 🔥 💧
C 🥝 🍑 D 🐼 🐨 D ❄️ 🌸

Output (sorted by key, ``.`` = missing)::

**Joining** the files, using **identifier column of 1** and a **header lines of 1**, will return::
#KEY in_1_c2 in_1_c3 in_2_c2 in_2_c3 in_3_c2 in_3_c3
A 🍎 🍊 🐱 🐶 ⭐ 🌙
B 🍇 🍓 🐸 🦊 . .
C 🥝 🍑 . . 🔥 💧
D . . 🐼 🐨 ❄️ 🌸

#KEY in_1_c2 in_1_c3 in_1_c4 in_2_c2 in_2_c3 in_2_c4 in_3_c2 in_3_c3 in_3_c4
one 1-1 1-2 1-3 2-1 2-2 2-3 3-3 3-2 3-3
three 1-7 1-8 1-9 2-7 2-8 2-9 3-7 3-8 3-9
two 1-4 1-5 1-6 2-4 2-5 2-6 3-4 3-5 3-6
-------

**Join with headers but without column names**

**Joining** the files, using **identifier column of 1** and a **header lines of 1**, but disabling **Add column name to header**, will return::
Same files, but with **Add column name to header** disabled::

#KEY in_1 in_1 in_1 in_2 in_2 in_2 in_3 in_3 in_3
one 1-1 1-2 1-3 2-1 2-2 2-3 3-3 3-2 3-3
three 1-7 1-8 1-9 2-7 2-8 2-9 3-7 3-8 3-9
two 1-4 1-5 1-6 2-4 2-5 2-6 3-4 3-5 3-6
#KEY in_1 in_1 in_2 in_2 in_3 in_3
A 🍎 🍊 🐱 🐶 ⭐ 🌙
B 🍇 🍓 🐸 🦊 . .
C 🥝 🍑 . . 🔥 💧
D . . 🐼 🐨 ❄️ 🌸

]]>
</help>
]]></help>
<citations>
</citations>
</tool>
64 changes: 64 additions & 0 deletions tools/collection_column_join/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<macros>
    <!--
        @DIAGRAM@ expands to a reStructuredText "raw" directive that embeds an
        inline SVG in the tool help. The SVG shows three input tables
        (in_1, in_2, in_3) with partially overlapping keys being joined into a
        single wide table, with "." marking cells whose key is absent from an input.

        Notes for maintainers:
        * Everything below ".. raw:: html" must stay indented; reStructuredText
          treats only the indented block as the directive's content.
        * Element ids are prefixed with "column-join-" because inline SVG ids
          share the id namespace of the page the help is rendered into.
        * The token must be placed on its own line, surrounded by blank lines,
          at column 0 of the help text.
    -->
    <token name="@DIAGRAM@"><![CDATA[
.. raw:: html

    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 750 330" width="750" height="330" style="max-width: 100%; height: auto;" role="img" aria-labelledby="column-join-diagram-title">
      <title id="column-join-diagram-title">Column join: three tabular datasets from a collection are joined on their key column into a single dataset; cells for keys missing from an input are filled with the fill character.</title>
      <defs>
        <filter id="column-join-shadow" x="-4%" y="-4%" width="110%" height="110%">
          <feGaussianBlur in="SourceAlpha" stdDeviation="2" result="blur"/>
          <feComponentTransfer in="blur"><feFuncA type="linear" slope="0.3"/></feComponentTransfer>
          <feMerge><feMergeNode/><feMergeNode in="SourceGraphic"/></feMerge>
        </filter>
        <marker id="column-join-arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
          <polygon points="0 0, 10 3.5, 0 7" fill="#555"/>
        </marker>
      </defs>
      <!-- Left: collection with 3 tabular datasets -->
      <rect x="10" y="10" width="195" height="270" rx="8" ry="8" fill="#d9ead3" stroke="#888" stroke-width="1" filter="url(#column-join-shadow)"/>
      <text x="108" y="28" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="bold" fill="#333">Input: A collection</text>
      <!-- in_1: keys A, B, C -->
      <rect x="20" y="36" width="175" height="68" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
      <text x="108" y="50" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">in_1</text>
      <text y="62" font-family="monospace" font-size="8" fill="#888"><tspan x="28" font-weight="bold">KEY</tspan><tspan x="80">c2</tspan><tspan x="130">c3</tspan></text>
      <text y="72" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">A</tspan><tspan x="80">🍎</tspan><tspan x="130">🍊</tspan></text>
      <text y="82" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">B</tspan><tspan x="80">🍇</tspan><tspan x="130">🍓</tspan></text>
      <text y="92" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">C</tspan><tspan x="80">🥝</tspan><tspan x="130">🍑</tspan></text>
      <!-- in_2: keys A, B, D (missing C) -->
      <rect x="20" y="112" width="175" height="68" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
      <text x="108" y="126" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">in_2</text>
      <text y="138" font-family="monospace" font-size="8" fill="#888"><tspan x="28" font-weight="bold">KEY</tspan><tspan x="80">c2</tspan><tspan x="130">c3</tspan></text>
      <text y="148" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">A</tspan><tspan x="80">🐱</tspan><tspan x="130">🐶</tspan></text>
      <text y="158" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">B</tspan><tspan x="80">🐸</tspan><tspan x="130">🦊</tspan></text>
      <text y="168" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">D</tspan><tspan x="80">🐼</tspan><tspan x="130">🐨</tspan></text>
      <!-- in_3: keys A, C, D (missing B) -->
      <rect x="20" y="188" width="175" height="68" rx="6" ry="6" fill="#fff" stroke="#aaa" stroke-width="0.7"/>
      <text x="108" y="202" text-anchor="middle" font-family="sans-serif" font-size="10" font-weight="bold" fill="#555">in_3</text>
      <text y="214" font-family="monospace" font-size="8" fill="#888"><tspan x="28" font-weight="bold">KEY</tspan><tspan x="80">c2</tspan><tspan x="130">c3</tspan></text>
      <text y="224" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">A</tspan><tspan x="80">⭐</tspan><tspan x="130">🌙</tspan></text>
      <text y="234" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">C</tspan><tspan x="80">🔥</tspan><tspan x="130">💧</tspan></text>
      <text y="244" font-size="10"><tspan x="28" font-family="monospace" font-size="8" fill="#888">D</tspan><tspan x="80">❄️</tspan><tspan x="130">🌸</tspan></text>
      <!-- Arrow from the collection to the joined output -->
      <line x1="220" y1="145" x2="330" y2="145" stroke="#555" stroke-width="2" marker-end="url(#column-join-arrowhead)"/>
      <text x="275" y="134.5" text-anchor="middle" font-family="sans-serif" font-size="11" font-weight="bold" fill="#555">Column Join</text>
      <text x="275" y="163" text-anchor="middle" font-family="sans-serif" font-size="9" fill="#888">on KEY column</text>
      <!-- Right: joined output -->
      <rect x="345" y="55" width="380" height="155" rx="8" ry="8" fill="#fff" stroke="#aaa" stroke-width="1" filter="url(#column-join-shadow)"/>
      <text x="535" y="73" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="bold" fill="#333">Output: A single dataset</text>
      <!-- Header -->
      <text y="90" font-family="monospace" font-size="7" font-weight="bold" fill="#333"><tspan x="358">KEY</tspan><tspan x="400">in_1_c2</tspan><tspan x="448">in_1_c3</tspan><tspan x="500">in_2_c2</tspan><tspan x="548">in_2_c3</tspan><tspan x="600">in_3_c2</tspan><tspan x="648">in_3_c3</tspan></text>
      <line x1="355" y1="94" x2="715" y2="94" stroke="#ddd" stroke-width="0.5"/>
      <!-- A: all present -->
      <text y="108" font-size="10"><tspan x="358" font-family="monospace" font-size="8" fill="#555">A</tspan><tspan x="410">🍎</tspan><tspan x="460">🍊</tspan><tspan x="510">🐱</tspan><tspan x="558">🐶</tspan><tspan x="610">⭐</tspan><tspan x="658">🌙</tspan></text>
      <!-- B: in_3 missing -->
      <text y="124" font-size="10"><tspan x="358" font-family="monospace" font-size="8" fill="#555">B</tspan><tspan x="410">🍇</tspan><tspan x="460">🍓</tspan><tspan x="510">🐸</tspan><tspan x="558">🦊</tspan><tspan x="612" font-family="monospace" font-size="8" fill="#bbb">.</tspan><tspan x="662" font-family="monospace" font-size="8" fill="#bbb">.</tspan></text>
      <!-- C: in_2 missing -->
      <text y="140" font-size="10"><tspan x="358" font-family="monospace" font-size="8" fill="#555">C</tspan><tspan x="410">🥝</tspan><tspan x="460">🍑</tspan><tspan x="512" font-family="monospace" font-size="8" fill="#bbb">.</tspan><tspan x="562" font-family="monospace" font-size="8" fill="#bbb">.</tspan><tspan x="610">🔥</tspan><tspan x="658">💧</tspan></text>
      <!-- D: in_1 missing -->
      <text y="156" font-size="10"><tspan x="358" font-family="monospace" font-size="8" fill="#555">D</tspan><tspan x="412" font-family="monospace" font-size="8" fill="#bbb">.</tspan><tspan x="462" font-family="monospace" font-size="8" fill="#bbb">.</tspan><tspan x="510">🐼</tspan><tspan x="558">🐨</tspan><tspan x="610">❄️</tspan><tspan x="658">🌸</tspan></text>
      <!-- Legend -->
      <text x="535" y="200" text-anchor="middle" font-family="monospace" font-size="8" fill="#bbb">. = fill character (missing key)</text>
      <!-- Annotation -->
      <text x="375" y="323" text-anchor="middle" font-family="sans-serif" font-size="10" font-style="italic" fill="#666">rows joined on key column; missing values filled with placeholder</text>
    </svg>

]]></token>
</macros>
Loading