Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions tools/taffy/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: Taffy
Comment thread
bgruening marked this conversation as resolved.
Outdated
owner: iuc
description: A set of tools for manipulating TAF files
long_description: |
  The Transposed Alignment Format (TAF) structures multiple sequence alignments as a
  sequence of columns, assigning each its own line. With optional run-length encoding
  for bases, TAF minimizes file size and solves the data fragmentation issues common
  in block-based formats like MAF.
# Project page of the wrapped software (taffy), same repository as the tool citation.
homepage_url: https://github.com/ComparativeGenomicsToolkit/taffy
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/taffy
categories:
  - Sequence Analysis
# Templates for the per-tool repositories; placeholders are substituted per tool.
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Wrapper for Taffy suite: {{ tool_name }}"
# Metadata of the suite repository.
suite:
  name: "suite_taffy"
  description: "A set of tools for manipulating TAF files"
  long_description: |
    The Transposed Alignment Format (TAF) structures multiple sequence alignments as a
    sequence of columns, assigning each its own line. With optional run-length encoding
    for bases, TAF minimizes file size and solves the data fragmentation issues common
    in block-based formats like MAF.
144 changes: 144 additions & 0 deletions tools/taffy/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
<macros>
<!-- Package requirement macro: depends on taffy at the version held in @TOOL_VERSION@ -->
<xml name="requirements">
    <requirements>
        <requirement version="@TOOL_VERSION@" type="package">taffy</requirement>
    </requirements>
</xml>
<!-- Tokens consumed by the tool's version and profile attributes -->
<token name="@TOOL_VERSION@">0.0.3</token>
<token name="@VERSION_SUFFIX@">0</token>
<token name="@PROFILE@">25.1</token>
<!--
    Defines two Cheetah variables holding fixed relative file names:
    $uncompressed_input for the staged alignment and $uncompressed_input_tai
    for a matching .tai file (presumably the alignment index; confirm against callers).
-->
<token name="@SET_UNCOMPRESS_NAMES@"><![CDATA[
#set $uncompressed_input = 'uncompressed_input'
#set $uncompressed_input_tai = 'uncompressed_input.tai'
]]>
</token>
<!--
    Stages $input_file under the name stored in $uncompressed_input:
    gzip-compressed TAF/MAF datasets are gunzipped, bzip2-compressed MAF datasets
    are decompressed with bzip2, and every other dataset is symlinked.
    Expects $uncompressed_input to be defined already (see @SET_UNCOMPRESS_NAMES@).
    Each branch ends with a shell "and" operator, so a command must follow the token.
-->
<token name="@UNCOMPRESS_TAF_AND_MAF@"><![CDATA[
#if $input_file.is_of_type('maf.gz', 'taf.gz')
gunzip -c '$input_file' > '$uncompressed_input' &&
#else if $input_file.is_of_type('maf.bz2')
bzip2 -dc '$input_file' > '$uncompressed_input' &&
#else
ln -s '$input_file' '$uncompressed_input' &&
#end if
]]>
</token>
<!--
    Tail of a command line for TAF output. When $compression_taf is 'gz' the
    outputFile option targets temp_file, which is then gzipped into $out_file;
    otherwise the outputFile option writes to $out_file directly.
-->
<token name="@COMPRESS_TAF@"><![CDATA[
#if $compression_taf == 'gz':
--outputFile temp_file &&
gzip -c temp_file > '$out_file'
#else:
--outputFile '$out_file'
#end if
]]>
</token>
<!--
    Tail of a command line for MAF output. Same scheme as the TAF variant, keyed on
    $compression_maf, with an additional 'bz2' branch that compresses temp_file with bzip2.
-->
<token name="@COMPRESS_MAF@"><![CDATA[
#if $compression_maf == 'gz':
--outputFile temp_file &&
gzip -c temp_file > '$out_file'
#else if $compression_maf == 'bz2':
--outputFile temp_file &&
bzip2 -c temp_file > '$out_file'
#else:
--outputFile '$out_file'
#end if
]]>
</token>
<!--
    Tail of a command line for PAF output. Keyed on $compression_paf; 'gz' writes to
    temp_file and gzips it into $out_file, any other value writes to $out_file directly.
-->
<token name="@COMPRESS_PAF@"><![CDATA[
#if $compression_paf == 'gz':
--outputFile temp_file &&
gzip -c temp_file > '$out_file'
#else:
--outputFile '$out_file'
#end if
]]>
</token>
<xml name="stdio">
    <stdio>
        <!-- Every non-zero exit status counts as a failure -->
        <exit_code range="1:"/>
        <exit_code range=":-1"/>
        <!-- Fallback for an unreliable exit status: scan stderr for error markers -->
        <regex match="Error:" source="stderr"/>
        <regex match="Exception:" source="stderr"/>
    </stdio>
</xml>
<!-- Alignment input accepting TAF or MAF, plain or compressed -->
<xml name="input_taf_maf">
    <param name="input_file"
           type="data"
           format="taf,taf.gz,maf,maf.gz,maf.bz2"
           label="TAF or MAF file"/>
</xml>
<!-- Alignment input restricted to TAF, plain or gzipped -->
<xml name="input_taf">
    <param name="input_file"
           type="data"
           format="taf,taf.gz"
           label="TAF file"/>
</xml>
<!-- HAL dataset input -->
<xml name="input_hal">
    <param name="input_hal"
           type="data"
           format="hal"
           label="HAL file"
           help="HAL file for extracting gap sequence. Each input alignment sequence must match a genome in the HAL file based on its name prefix (e.g. Genome from Genome.chr1). If no matching genome is found, the tool aborts"/>
</xml>
<!-- One or more FASTA datasets, plain or compressed -->
<xml name="input_fasta">
    <param name="input_fasta"
           type="data"
           format="fasta,fasta.gz,fasta.bz2"
           multiple="true"
           label="FASTA file(s)"
           help="FASTA file(s) for extracting gap sequence"/>
</xml>
<!-- Index dataset (tai) belonging to the input alignment -->
<xml name="input_index">
    <param name="input_index"
           type="data"
           format="tai"
           label="Index file"
           help="Index file corresponding to the input alignment. This is required to allow fast access to coordinates. If missing, it can be generated with the 'Taffy index' tool"/>
</xml>
Comment thread
SaimMomin12 marked this conversation as resolved.
Outdated
<!-- Boolean switch that emits the colorBases flag when enabled and nothing otherwise -->
<xml name="params_colorBases">
    <param argument="--colorBases"
           type="boolean"
           truevalue="--colorBases"
           falsevalue=""
           checked="false"
           label="Colorize bases"
           help="Adds color codes to the output for easier reading"/>
</xml>
<!-- Integer option (minimum 1, default 10000) for the coordinate repetition interval -->
<xml name="params_repeatCoordinatesEveryNColumns">
    <param argument="--repeatCoordinatesEveryNColumns"
           type="integer"
           min="1"
           value="10000"
           label="Repeat coordinates interval"
           help="Repeat coordinates of each sequence at least every n columns. Lower values allow for finer-grained indexing"/>
</xml>
<!-- Output compression choice for TAF results: none or gz -->
<xml name="params_conditional_compression_taf">
    <param name="compression_taf" type="select" label="Compress output">
        <option value="none" selected="true">Don't compress output (default)</option>
        <option value="gz">Compress output to .gz</option>
    </param>
</xml>
<!-- Output compression choice for MAF results: none, gz or bz2 -->
<xml name="params_conditional_compression_maf">
    <param name="compression_maf" type="select" label="Compress output">
        <option value="none" selected="true">Don't compress output (default)</option>
        <option value="gz">Compress output to .gz</option>
        <option value="bz2">Compress output to .bz2</option>
    </param>
</xml>
<!-- Output compression choice for PAF results: none or gz -->
<xml name="params_conditional_compression_paf">
    <param name="compression_paf" type="select" label="Compress output">
        <option value="none" selected="true">Don't compress output (default)</option>
        <option value="gz">Compress output to .gz</option>
    </param>
</xml>
<!-- Sanitizer allowing ASCII letters, digits, punctuation and the space character; other characters are replaced by nothing -->
<xml name="sanitizer_default">
    <sanitizer invalid_char="">
        <valid initial="string.ascii_letters,string.digits,string.punctuation">
            <add value=" "/>
        </valid>
    </sanitizer>
</xml>
<!-- Same character set as above but without the space character -->
<xml name="sanitizer_default_without_space">
    <sanitizer invalid_char="">
        <valid initial="string.ascii_letters,string.digits,string.punctuation"/>
    </sanitizer>
</xml>
<!--
    Accepts one or more items separated by exactly one space each. An item may only
    contain ASCII letters, digits and the characters . _ | : and hyphen. Leading,
    trailing or repeated spaces make the value invalid.
-->
<xml name="validator_space_list">
<validator type="regex" message="Provide a space-separated list, without leading or trailing spaces">^[A-Za-z0-9._|:-]+( [A-Za-z0-9._|:-]+)*$</validator>
</xml>
<!--
    Accepts any non-empty value whose first and last characters are not whitespace;
    whitespace in between is allowed.
-->
<xml name="validator_trim">
<validator type="regex" message="Enter without leading or trailing spaces">^\S(?:.*\S)?$</validator>
</xml>
<!-- Wrapper authorship metadata: two persons and one organization -->
<xml name="creator">
    <creator>
        <person givenName="Niklas"
                familyName="Mayle"
                url="https://github.com/Maed0x"/>
        <person givenName="Saim"
                familyName="Momin"
                url="https://github.com/SaimMomin12"/>
        <organization name="Galaxy Europe"
                      url="https://galaxyproject.org/eu/"/>
    </creator>
</xml>
<!--
    Citations for the wrapped software: a BibTeX entry for the taffy GitHub repository
    plus a DOI reference. BibTeX separates individual authors with the word "and";
    a comma would make the two people parse as a single name.
    NOTE(review): the year field is empty in the entry; fill it in once confirmed.
-->
<xml name="citation">
    <citations>
        <citation type="bibtex">
@misc{githubtaffy,
author = {Glenn Hickey and Benedict Paten},
year = {},
title = {taffy},
publisher = {GitHub},
journal = {GitHub repository},
url = {https://github.com/ComparativeGenomicsToolkit/taffy},
}</citation>
        <citation type="doi">10.1038/s41586-020-2871-y</citation>
    </citations>
</xml>
</macros>
162 changes: 162 additions & 0 deletions tools/taffy/taffy_add_gap_bases.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
<tool id="taffy_add_gap_bases" name="Taffy add gap bases" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<description>fills interstitial gaps in a TAF file</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="requirements"/>
<expand macro="stdio"/>
<!--
    Command template, in order:
    1. define the staging file names and stage the TAF input (decompress or symlink);
    2. when FASTA is the gap source, stage every selected FASTA dataset as
       input_N.fasta (N is its position in the selection), decompressing gz/bz2 files;
    3. run "taffy add-gap-bases" with either the HAL file option or the staged FASTA
       files as positional arguments, followed by the remaining options;
    4. the trailing token appends the output file option and, if requested,
       the gzip step for the result.
-->
<command detect_errors="aggressive">
<![CDATA[
## Set variables
@SET_UNCOMPRESS_NAMES@

## Uncompress input files if necessary and create symlinks for uncompressed files
@UNCOMPRESS_TAF_AND_MAF@

## Uncompress fasta files if set
#if $gapFill.source == 'fasta':
#for $i, $fasta_file in enumerate($gapFill.input_fasta):
#set $temp_input = 'input_%s.fasta' % $i
#if $fasta_file.is_of_type('fasta.gz')
gunzip -c '$fasta_file' > '$temp_input' &&
#else if $fasta_file.is_of_type('fasta.bz2')
bzip2 -dc '$fasta_file' > '$temp_input' &&
#else
ln -s '$fasta_file' '$temp_input' &&
#end if
#end for
#end if

## Run main command
taffy add-gap-bases
#if $gapFill.source == 'hal':
--halFile '$gapFill.input_hal'
#elif $gapFill.source == 'fasta':
#for $i, $fasta_file in enumerate($gapFill.input_fasta):
'input_${i}.fasta'
#end for
#end if
--logLevel 'INFO'
--inputFile '$uncompressed_input'
--maximumGapStringLength $maximumGapStringLength
--repeatCoordinatesEveryNColumns $repeatCoordinatesEveryNColumns

## Compress TAF output if requested by user
@COMPRESS_TAF@
]]></command>
<inputs>
    <!-- Alignment whose gaps are to be filled -->
    <expand macro="input_taf"/>
    <param argument="--maximumGapStringLength"
           type="integer"
           min="-1"
           value="50"
           label="Maximum gap string length to insert"
           help="Specify the maximum length of a gap to fill. Gaps longer than this will not be filled. A negative value (e.g., -1) means no limit"/>
    <!-- Sequence source selection; the initial empty choice is a placeholder guarded by the empty_field validator -->
    <conditional name="gapFill">
        <param name="source" type="select" label="Source of sequence to fill gaps">
            <option value="" selected="true">Select the source of the sequences ...</option>
            <option value="hal">Fill gaps via a HAL file</option>
            <option value="fasta">Fill gaps via one or more FASTA files</option>
            <validator type="empty_field" message="You must select a source of the sequences to fill gaps"/>
        </param>
        <when value=""/>
        <when value="hal">
            <expand macro="input_hal"/>
        </when>
        <when value="fasta">
            <expand macro="input_fasta"/>
        </when>
    </conditional>
    <!-- Output tuning shared through macros -->
    <expand macro="params_repeatCoordinatesEveryNColumns"/>
    <expand macro="params_conditional_compression_taf"/>
</inputs>
<outputs>
    <!-- Result is TAF by default and switches to taf.gz when gzip compression is selected -->
    <data name="out_file" format="taf">
        <change_format>
            <when input="compression_taf" value="gz" format="taf.gz"/>
        </change_format>
    </data>
</outputs>
<tests>
    <!-- Test 1: default settings, gaps filled from a HAL file -->
    <test expect_num_outputs="1">
        <param name="input_file" value="mr.taf"/>
        <conditional name="gapFill">
            <param name="source" value="hal"/>
            <param name="input_hal" value="mr.hal"/>
        </conditional>
        <output name="out_file" ftype="taf">
            <assert_contents>
                <has_text text="#taf" n="1"/>
                <has_text text=" ; g " negate="true"/>
                <has_line line="AAA ; G 0 G G 1 G G 2 G"/>
                <has_line line="AAA ; G 0 AT G 1 AA G 2 AA"/>
            </assert_contents>
        </output>
    </test>
    <!-- Test 2: default settings, gaps filled from a gzipped FASTA file -->
    <test expect_num_outputs="1">
        <param name="input_file" value="mr.taf"/>
        <conditional name="gapFill">
            <param name="source" value="fasta"/>
            <param name="input_fasta" value="mr.mrrefChr1.fasta.gz"/>
        </conditional>
        <output name="out_file" ftype="taf">
            <assert_contents>
                <has_text text="#taf" n="1"/>
                <has_line line="AAA ; G 0 G g 1 1 g 2 1"/>
                <has_line line="AAA ; G 0 TA g 1 2 g 2 2"/>
            </assert_contents>
        </output>
    </test>
    <!-- Test 3: HAL source with the maximum gap string length lowered to 1 -->
    <test expect_num_outputs="1">
        <param name="input_file" value="mr.taf"/>
        <param name="maximumGapStringLength" value="1"/>
        <conditional name="gapFill">
            <param name="source" value="hal"/>
            <param name="input_hal" value="mr.hal"/>
        </conditional>
        <output name="out_file" ftype="taf">
            <assert_contents>
                <has_text text="#taf" n="1"/>
                <has_line line="AAA ; G 0 G G 1 G G 2 G"/>
                <has_line line="AAA ; G 0 AT G 1 AA G 2 AA" negate="true"/>
                <has_line line="AAA ; g 0 2 g 1 2 g 2 2"/>
            </assert_contents>
        </output>
    </test>

    <!-- Test 4: HAL source with gzip-compressed TAF output, checked by size only -->
    <test expect_num_outputs="1">
        <param name="input_file" value="mr.taf"/>
        <conditional name="gapFill">
            <param name="source" value="hal"/>
            <param name="input_hal" value="mr.hal"/>
        </conditional>
        <param name="compression_taf" value="gz"/>
        <output name="out_file" ftype="taf.gz">
            <assert_contents>
                <has_size size="282" delta="10"/>
            </assert_contents>
        </output>
    </test>
</tests>
<help><![CDATA[
Taffy add gap bases fills the gaps between alignment blocks by inserting the actual nucleotide sequence.
This is done individually for each sequence in the alignment.
To do so, the tool requires the corresponding sequence data from FASTA or HAL files.
The tool only fills the gaps between alignment blocks; it does not overwrite existing 'N' characters within a block.

- **FASTA files:**
The sequences in the FASTA file must start at position 0 (whole chromosome/contig).
Sub-sequences with offset coordinates are not supported.
The FASTA header must contain only the sequence name (e.g., ``>Genome.chr1``), without any additional description.

- **HAL files:**
The HAL file must contain all genomes (species) present in the TAF file.
While it does not strictly need to contain every single sequence, missing sequences will result in gaps not being filled.

When a gap is filled, the tool encodes this information with a 'G' (gap with sequence) tag by replacing the 'g' (gap) tag in the header line of the alignment block in the form: ``BASES ; G <POS> <FILLED_BASES>``

The maximum length of inserted gap strings can be limited to prevent filling excessively large regions.
Use this tool to produce alignments with explicit gap sequences, making them suitable for downstream analyses that require filled interstitial regions.
]]></help>
<expand macro="citation"/>
<expand macro="creator"/>
</tool>
Loading
Loading