galaxyproject · Maed0x · Mar 24, 2026 · Mar 24, 2026 · Mar 24, 2026 · Apr 17, 2026
diff --git a/tools/taffy/.shed.yml b/tools/taffy/.shed.yml
@@ -0,0 +1,23 @@
+name: taffy  # Tool Shed repository names may only use lower-case letters, digits and underscores
+owner: iuc
+description: A set of tools for manipulating TAF files
+long_description: |
+  The Transposed Alignment Format (TAF) structures multiple sequence alignments as a
+  sequence of columns, assigning each its own line. With optional run-length encoding
+  for bases, TAF minimizes file size and solves the data fragmentation issues common
+  in block-based formats like MAF.
+homepage_url: https://github.com/ComparativeGenomicsToolkit/taffy
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/taffy
+categories:
+- Sequence Analysis
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "Wrapper for Taffy suite: {{ tool_name }}"
+suite:
+  name: "suite_taffy"
+  description: "A set of tools for manipulating TAF files"
+  long_description: |
+    The Transposed Alignment Format (TAF) structures multiple sequence alignments as a
+    sequence of columns, assigning each its own line. With optional run-length encoding
+    for bases, TAF minimizes file size and solves the data fragmentation issues common
+    in block-based formats like MAF.
diff --git a/tools/taffy/macros.xml b/tools/taffy/macros.xml
@@ -0,0 +1,144 @@
+<macros>
+    <xml name="requirements"> <!-- Package requirement shared by the wrappers: taffy at @TOOL_VERSION@ -->
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">taffy</requirement>
+        </requirements>
+    </xml>
+    <token name="@TOOL_VERSION@">0.0.3</token> <!-- taffy version used for the package requirement and the tool version -->
+    <token name="@VERSION_SUFFIX@">0</token> <!-- wrapper revision, appended after "+galaxy" in the tool version -->
+    <token name="@PROFILE@">25.1</token> <!-- value substituted into the tool's profile attribute -->
+    <token name="@SET_UNCOMPRESS_NAMES@"><![CDATA[
+        #set $uncompressed_input = 'uncompressed_input'
+        #set $uncompressed_input_tai = 'uncompressed_input.tai'
+        ]]>
+    </token> <!-- Sets the Cheetah variables naming the staged input file and its .tai companion -->
+    <token name="@UNCOMPRESS_TAF_AND_MAF@"><![CDATA[
+        #if $input_file.is_of_type('maf.gz', 'taf.gz')
+            gunzip -c '$input_file' > '$uncompressed_input' &&
+        #else if $input_file.is_of_type('maf.bz2')
+            bzip2 -dc '$input_file' > '$uncompressed_input' &&
+        #else
+            ln -s '$input_file' '$uncompressed_input' &&
+        #end if
+        ]]>
+    </token> <!-- Stages $input_file as $uncompressed_input: gzip (maf.gz, taf.gz) and bzip2 (maf.bz2) inputs are decompressed, anything else is symlinked -->
+    <token name="@COMPRESS_TAF@"><![CDATA[
+        #if $compression_taf == 'gz':
+            --outputFile temp_file && 
+            gzip -c temp_file > '$out_file'
+        #else:
+            --outputFile '$out_file'
+        #end if
+        ]]>
+    </token> <!-- Command tail for TAF output: writes straight to $out_file, or to temp_file followed by gzip when compression_taf is gz -->
+    <token name="@COMPRESS_MAF@"><![CDATA[
+        #if $compression_maf == 'gz':
+            --outputFile temp_file && 
+            gzip -c temp_file > '$out_file'
+        #else if $compression_maf == 'bz2':
+            --outputFile temp_file && 
+            bzip2 -c temp_file > '$out_file'
+        #else:
+            --outputFile '$out_file'
+        #end if
+        ]]>
+    </token> <!-- Command tail for MAF output: writes straight to $out_file, or to temp_file followed by gzip/bzip2 depending on compression_maf -->
+    <token name="@COMPRESS_PAF@"><![CDATA[
+        #if $compression_paf == 'gz':
+            --outputFile temp_file && 
+            gzip -c temp_file > '$out_file'
+        #else:
+            --outputFile '$out_file'
+        #end if
+        ]]>
+    </token> <!-- Command tail for PAF output: writes straight to $out_file, or to temp_file followed by gzip when compression_paf is gz -->
+    <xml name="stdio">
+        <stdio>
+            <!-- Every non-zero exit code, positive or negative, marks the job as failed -->
+            <exit_code range="1:"/>
+            <exit_code range=":-1"/>
+            <!-- Additionally scan stderr for error markers in case the exit code was left at zero -->
+            <regex source="stderr" match="Error:"/>
+            <regex source="stderr" match="Exception:"/>
+        </stdio>
+    </xml>
+    <xml name="input_taf_maf"> <!-- Alignment input accepting TAF or MAF, plain or compressed -->
+        <param name="input_file" type="data" format="taf,taf.gz,maf,maf.gz,maf.bz2" label="TAF or MAF file"/>
+    </xml>
+    <xml name="input_taf"> <!-- Alignment input restricted to TAF, plain or gzip-compressed -->
+        <param name="input_file" type="data" format="taf,taf.gz" label="TAF file"/>
+    </xml>
+    <xml name="input_hal"> <!-- HAL dataset used as the gap sequence source -->
+        <param name="input_hal" type="data" format="hal" label="HAL file" help="HAL file for extracting gap sequence. Each input alignment sequence must match a genome in the HAL file based on its name prefix (e.g. Genome from Genome.chr1). If no matching genome is found, the tool aborts"/>
+    </xml>
+    <xml name="input_fasta"> <!-- One or more FASTA datasets used as the gap sequence source -->
+        <param name="input_fasta" type="data" format="fasta,fasta.gz,fasta.bz2" multiple="true" label="FASTA file(s)" help="FASTA file(s) for extracting gap sequence"/>
+    </xml>
+    <xml name="input_index"> <!-- .tai index dataset belonging to the input alignment -->
+        <param name="input_index" type="data" format="tai" label="Index file" help="Index file corresponding to the input alignment. This is required to allow fast access to coordinates. If missing, it can be generated with the 'Taffy index' tool"/>
+    </xml>
+    <xml name="params_colorBases"> <!-- Boolean switch emitting the colorBases flag when checked -->
+        <param argument="--colorBases" type="boolean" truevalue="--colorBases" falsevalue="" checked="false" label="Colorize bases" help="Adds color codes to the output for easier reading"/>
+    </xml>
+    <xml name="params_repeatCoordinatesEveryNColumns"> <!-- Positive integer, default 10000 -->
+        <param argument="--repeatCoordinatesEveryNColumns" type="integer" min="1" value="10000" label="Repeat coordinates interval" help="Repeat coordinates of each sequence at least every n columns. Lower values allow for finer-grained indexing"/>
+    </xml>
+    <xml name="params_conditional_compression_taf"> <!-- Output compression choice for TAF: none or gz -->
+        <param name="compression_taf" type="select" label="Compress output">
+            <option value="none" selected="true">Don't compress output (default)</option>
+            <option value="gz">Compress output to .gz</option>
+        </param>
+    </xml>
+    <xml name="params_conditional_compression_maf"> <!-- Output compression choice for MAF: none, gz or bz2 -->
+        <param name="compression_maf" type="select" label="Compress output">
+            <option value="none" selected="true">Don't compress output (default)</option>
+            <option value="gz">Compress output to .gz</option>
+            <option value="bz2">Compress output to .bz2</option>
+        </param>
+    </xml>
+    <xml name="params_conditional_compression_paf"> <!-- Output compression choice for PAF: none or gz -->
+        <param name="compression_paf" type="select" label="Compress output">
+            <option value="none" selected="true">Don't compress output (default)</option>
+            <option value="gz">Compress output to .gz</option>
+        </param>
+    </xml>
+    <xml name="sanitizer_default"> <!-- Keeps letters, digits, punctuation and spaces; the single quote is dropped so a value cannot end a single-quoted shell argument -->
+        <sanitizer invalid_char="">
+            <valid initial="string.ascii_letters,string.digits,string.punctuation">
+                <add value=" "/><remove value="&apos;"/>
+            </valid>
+        </sanitizer>
+    </xml>
+    <xml name="sanitizer_default_without_space"> <!-- Keeps letters, digits and punctuation (no spaces); the single quote is dropped so a value cannot end a single-quoted shell argument -->
+        <sanitizer invalid_char="">
+            <valid initial="string.ascii_letters,string.digits,string.punctuation"><remove value="&apos;"/></valid>
+        </sanitizer>
+    </xml>
+    <xml name="validator_space_list"> <!-- Tokens of letters, digits and . _ | : - separated by single spaces -->
+        <validator type="regex" message="Provide a space-separated list, without leading or trailing spaces">^[A-Za-z0-9._|:-]+( [A-Za-z0-9._|:-]+)*$</validator>
+    </xml>
+    <xml name="validator_trim"> <!-- Non-empty value whose first and last characters are not whitespace -->
+        <validator type="regex" message="Enter without leading or trailing spaces">^\S(?:.*\S)?$</validator>
+    </xml>
+    <xml name="creator"> <!-- Wrapper authorship metadata: two persons and one organization -->
+        <creator>
+            <person givenName="Niklas" familyName="Mayle" url="https://github.com/Maed0x"/>
+            <person givenName="Saim" familyName="Momin" url="https://github.com/SaimMomin12"/>
+            <organization name="Galaxy Europe" url="https://galaxyproject.org/eu/"/>
+        </creator>
+    </xml>
+    <xml name="citation"> <!-- Shared citations: a BibTeX entry for the taffy repository plus one DOI; BibTeX authors are joined with "and" -->
+        <citations>
+            <citation type="bibtex">
+                @misc{githubtaffy,
+                author = {Hickey, Glenn and Paten, Benedict},
+                year = {},
+                title = {taffy},
+                publisher = {GitHub},
+                journal = {GitHub repository},
+                url = {https://github.com/ComparativeGenomicsToolkit/taffy},
+                }</citation>
+            <citation type="doi">10.1038/s41586-020-2871-y</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/taffy/taffy_add_gap_bases.xml b/tools/taffy/taffy_add_gap_bases.xml
@@ -0,0 +1,162 @@
+<tool id="taffy_add_gap_bases" name="Taffy add gap bases" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>fills interstitial gaps in a TAF file</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/> 
+    <expand macro="stdio"/>
+    <command detect_errors="aggressive">
+        <![CDATA[
+        ## Define the file names used for the staged alignment input
+        @SET_UNCOMPRESS_NAMES@
+
+        ## Stage the alignment as '$uncompressed_input': decompress it if needed, otherwise symlink it
+        @UNCOMPRESS_TAF_AND_MAF@
+
+        ## FASTA source only: stage every selected file as input_<i>.fasta (gunzip, bzip2 or symlink)
+        #if $gapFill.source == 'fasta':
+            #for $i, $fasta_file in enumerate($gapFill.input_fasta):
+                #set $temp_input = 'input_%s.fasta' % $i
+                #if $fasta_file.is_of_type('fasta.gz')
+                    gunzip -c '$fasta_file' > '$temp_input' &&
+                #else if $fasta_file.is_of_type('fasta.bz2')
+                    bzip2 -dc '$fasta_file' > '$temp_input' &&
+                #else
+                    ln -s '$fasta_file' '$temp_input' &&
+                #end if
+            #end for
+        #end if
+
+        ## Run taffy add-gap-bases with either the HAL file or the staged FASTA files as sequence source
+        taffy add-gap-bases
+            #if $gapFill.source == 'hal':
+                --halFile '$gapFill.input_hal'
+            #elif $gapFill.source == 'fasta':
+                #for $i, $fasta_file in enumerate($gapFill.input_fasta):
+                    'input_${i}.fasta'
+                #end for
+            #end if
+            --logLevel 'INFO'
+            --inputFile '$uncompressed_input'
+            --maximumGapStringLength $maximumGapStringLength
+            --repeatCoordinatesEveryNColumns $repeatCoordinatesEveryNColumns
+
+        ## Append the output option; the result is gzip-compressed when the user selected gz
+        @COMPRESS_TAF@
+    ]]></command>
+    <inputs> <!-- TAF input, gap length limit, gap sequence source (HAL or FASTA), coordinate interval, output compression -->
+        <expand macro="input_taf"/>
+        <param argument="--maximumGapStringLength" type="integer" min="-1" value="50" label="Maximum gap string length to insert" help="Specify the maximum length of a gap to fill. Gaps longer than this will not be filled. A negative value (e.g., -1) means no limit"/>
+        <conditional name="gapFill">
+            <param name="source" type="select" label="Source of sequence to fill gaps">
+                <option value="" selected="true">Select the source of the sequences ...</option>
+                <option value="hal">Fill gaps via a HAL file</option>
+                <option value="fasta">Fill gaps via one or more FASTA files</option>
+                <validator type="empty_field" message="You must select a source of the sequences to fill gaps"/>
+            </param>
+            <when value=""/>
+            <when value="hal">
+                <expand macro="input_hal"/>
+            </when>
+            <when value="fasta">
+                <expand macro="input_fasta"/>
+            </when>
+        </conditional>
+        <expand macro="params_repeatCoordinatesEveryNColumns"/>
+        <expand macro="params_conditional_compression_taf"/>
+    </inputs>
+    <outputs> <!-- One dataset: format taf, switched to taf.gz when compression_taf is gz -->
+        <data name="out_file" format="taf">
+            <change_format>
+                <when input="compression_taf" value="gz" format="taf.gz"/>
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test 1: TAF input, default parameters, HAL source; expects filled 'G' entries and no remaining ' ; g ' tag -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="mr.taf"/>
+            <conditional name="gapFill">
+                <param name="source" value="hal"/>
+                <param name="input_hal" value="mr.hal"/>
+            </conditional>
+            <output name="out_file" ftype="taf">
+                <assert_contents>
+                    <has_text text="#taf" n="1"/>
+                    <has_text text=" ; g " negate="true"/>
+                    <has_line line="AAA ; G 0 G G 1 G G 2 G"/>
+                    <has_line line="AAA ; G 0 AT G 1 AA G 2 AA"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 2: TAF input, default parameters, gzip-compressed FASTA source; expects row 0 filled while rows 1 and 2 keep 'g' tags -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="mr.taf"/>
+            <conditional name="gapFill">
+                <param name="source" value="fasta"/>
+                <param name="input_fasta" value="mr.mrrefChr1.fasta.gz"/>
+            </conditional>
+            <output name="out_file" ftype="taf">
+                <assert_contents>
+                    <has_text text="#taf" n="1"/>
+                    <has_line line="AAA ; G 0 G g 1 1 g 2 1"/>
+                    <has_line line="AAA ; G 0 TA g 1 2 g 2 2"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 3: TAF input, HAL source, maximum gap length 1; expects length-1 gaps filled and length-2 gaps left as 'g' tags -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="mr.taf"/>
+            <param name="maximumGapStringLength" value="1"/>
+            <conditional name="gapFill">
+                <param name="source" value="hal"/>
+                <param name="input_hal" value="mr.hal"/>
+            </conditional>
+            <output name="out_file" ftype="taf">
+                <assert_contents>
+                    <has_text text="#taf" n="1"/>
+                    <has_line line="AAA ; G 0 G G 1 G G 2 G"/>
+                    <has_line line="AAA ; G 0 AT G 1 AA G 2 AA" negate="true"/>
+                    <has_line line="AAA ; g 0 2 g 1 2 g 2 2"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Test 4: TAF input, HAL source, gz output compression; output is typed taf.gz and checked by size only -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="mr.taf"/>
+            <conditional name="gapFill">
+                <param name="source" value="hal"/>
+                <param name="input_hal" value="mr.hal"/>
+            </conditional>
+            <param name="compression_taf" value="gz"/>
+            <output name="out_file" ftype="taf.gz">
+                <assert_contents>
+                    <has_size size="282" delta="10"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+Taffy add gap bases fills the gaps between alignment blocks by inserting the actual nucleotide sequence. 
+This is done individually for each sequence in the alignment.
+To do so, the tool requires the corresponding sequence data from FASTA or HAL files. 
+The tool only fills the gaps between alignment blocks; it does not overwrite existing 'N' characters within a block.
+
+- **FASTA files:**
+  The sequences in the FASTA file must start at position 0 (whole chromosome/contig). 
+  Sub-sequences with offset coordinates are not supported.
+  The FASTA header must contain only the sequence name (e.g., ``>Genome.chr1``), without any additional description.
+
+- **HAL files:**
+  The HAL file must contain all genomes (species) present in the TAF file.
+  While it does not strictly need to contain every single sequence, missing sequences will result in gaps not being filled.
+
+When a gap is filled, the tool encodes this information with a 'G' (gap with sequence) tag by replacing the 'g' (gap) tag in the header line of the alignment block in the form: ``BASES ; G <POS> <FILLED_BASES>``
+
+The maximum length of inserted gap strings can be limited to prevent filling excessively large regions.
+Use this tool to produce alignments with explicit gap sequences, making them suitable for downstream analyses that require filled interstitial regions.
+    ]]></help>
+    <expand macro="citation"/>
+    <expand macro="creator"/>
+</tool>