Skip to content

Commit

Permalink
Add files
Browse files Browse the repository at this point in the history
  • Loading branch information
gallardoalba committed Sep 6, 2023
1 parent ddcbfe5 commit f80c7c2
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 2 deletions.
11 changes: 11 additions & 0 deletions tools/genera/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Tool Shed metadata for the GenEra Galaxy wrapper.
categories:
  - Sequence Analysis
description: Estimates gene-family founder events
# Literal block scalar: the body must be indented relative to the key.
long_description: |
  GenEra estimates gene-family founder events (i.e., the age of the last common ancestor of
  protein-coding gene families) through the reimplementation of genomic phylostratigraphy.
name: genera
owner: bgruening
remote_repository_url: https://github.com/bgruening/galaxytools/tree/master/tools/genera
homepage_url: https://github.com/josuebarrera/GenEra
type: unrestricted
128 changes: 127 additions & 1 deletion tools/genera/genera.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,143 @@
</macros>
<expand macro='requirements' />
<command detect_errors="exit_code"><![CDATA[
    ## Stage the DIAMOND database under a fixed name, from the history or
    ## from the diamond_database data table, depending on the user's choice.
    #if $ref_db_source.db_source == "history":
        ln -s '${ref_db_source.reference_database}' './database.dmnd' &&
    #else:
        ln -s '${ref_db_source.index.fields.db_path}' './database.dmnd' &&
    #end if

    ## Unpack the taxdump archive into the directory handed to -d.
    mkdir -p './taxdump' &&
    ln -s '${NCBI_taxdump}' 'taxdump.tar' &&
    tar -xf 'taxdump.tar' -C './taxdump' &&

    genEra
        -n "\${GALAXY_SLOTS:-4}"
        -q '${FASTA_input}'
        -t '${NCBI_taxonomy}'
        -b './database.dmnd'
        -d './taxdump'
        #if $evolutionary_distances
            -s '${evolutionary_distances}'
        #end if
        -x "\${TMPDIR:-.}"
        -l $advanced_options.taxonomic_threshold
        -e $advanced_options.diamond_threshold
        -m $advanced_options.matches_threshold
        -y $advanced_options.diamond_sensitivity

    ## from_work_dir takes a literal path (no wildcards), so the result files,
    ## which are prefixed with the taxonomy ID, are renamed to fixed names.
    ## NOTE(review): the exact suffix of the *summary* files could not be
    ## confirmed from this wrapper; the globs below tolerate both spellings
    ## that appear in it. Confirm against the GenEra release in the container.
    && mv '${NCBI_taxonomy}_gene_ages.tsv' 'gene_ages.tsv'
    && mv '${NCBI_taxonomy}_gene_age_summary.tsv' 'gene_ages_summary.tsv'
    && mv '${NCBI_taxonomy}_founder_events.tsv' 'founder_events.tsv'
    && mv '${NCBI_taxonomy}'_founder*summary.tsv 'founder_events_summary.tsv'
    #if $evolutionary_distances
        && mv '${NCBI_taxonomy}_HDF_gene_ages.tsv' 'HDF_gene_ages.tsv'
        && mv '${NCBI_taxonomy}'_HDF_gene_age*summary.tsv 'HDF_gene_ages_summary.tsv'
        && mv '${NCBI_taxonomy}_HDF_founder_events.tsv' 'HDF_founder_events.tsv'
        && mv '${NCBI_taxonomy}'_HDF_founder*summary.tsv 'HDF_founder_events_summary.tsv'
    #end if
]]></command>
<inputs>
    <param argument="-q" name="FASTA_input" type="data" format="fasta" label="Protein sequences" help="File with query protein sequences in FASTA format to perform pairwise sequence alignments using DIAMOND" />
    <!-- Digits only: the sanitizer strips every other character and the anchored regex rejects empty or mixed values. -->
    <param argument="-t" name="NCBI_taxonomy" type="text" label="NCBI Taxonomy ID" help="NCBI Taxonomy ID of query species">
        <sanitizer invalid_char="">
            <valid initial="string.digits"/>
        </sanitizer>
        <validator type="regex" message="The NCBI Taxonomy ID must consist of digits only">^[0-9]+$</validator>
    </param>
    <!-- The DIAMOND database comes either from the diamond_database data table or from the history. -->
    <conditional name="ref_db_source">
        <param name="db_source" argument="-b" type="select" label="DIAMOND reference database" help="Built-ins were indexed using default options">
            <option value="indexed">Use a built-in index</option>
            <option value="history">Use one from the history</option>
        </param>
        <when value="indexed">
            <param name="index" type="select" label="Select a reference database" help="If your database of interest is not listed, contact your Galaxy admin">
                <options from_data_table="diamond_database">
                    <filter type="sort_by" column="2"/>
                    <validator type="no_options" message="No indexes are available for the selected input dataset"/>
                </options>
            </param>
        </when>
        <when value="history">
            <param name="reference_database" argument="-b" type="data" format="dmnd" label="Select the reference database" />
        </when>
    </conditional>
    <!-- Supplied as an archive; the command extracts it into ./taxdump before running the tool. -->
    <param argument="-d" name="NCBI_taxdump" type="data" format="tar,gz,tgz" label="NCBI taxdump archive" help="The NCBI taxdump file is a database file provided by the National Center for Biotechnology Information (NCBI) that contains the names and phylogenetic lineages of organisms with molecular data in the NCBI databases"/>
    <!-- Optional: when set, -s is passed and the four HDF outputs are produced. -->
    <param argument="-s" name="evolutionary_distances" type="data" format="tabular" optional="true" label="Table with pairwise evolutionary distances" help="Table with pairwise evolutionary distances (substitutions/site) between several species in the database and the query species (necessary to calculate homology detection failure probabilities with abSENSE). NOTE: the query species SHOULD be included in this table" />
    <section name="advanced_options" title="Advanced options">
        <!-- Expressed as a percentage, hence the 0-100 range. -->
        <param argument="-l" name="taxonomic_threshold" type="integer" min="0" max="100" value="30" label="Taxonomic representativeness threshold" help="Taxonomic representativeness threshold below which a gene will be flagged as putative genome contamination or the product of a horizontal gene transfer event. The threshold is established as 30% by default, but it can be freely modified by the user" />
        <param argument="-e" name="diamond_threshold" type="float" min="0" max="1" value="0.00001" label="DIAMOND E-value threshold" help="The e-value parameter of Diamond does not only control the output, it also affects the algorithm. It is used by some internal heuristics for ranking hits before final alignments are computed so you can't expect to get exactly the same results" />
        <param argument="-m" name="matches_threshold" type="integer" min="0" max="100" value="10" label="Matches percentage threshold" help="Minimum percentage of matches between your query sequences and any species within a taxonomic level to consider it useful for the gene age assignment (i.e., filtering taxonomic levels lacking complete genomes in the database)" />
        <param argument="-y" name="diamond_sensitivity" type="select" label="DIAMOND sensitivity" help="By default, GenEra runs DIAMOND in sensitive mode to retrieve the highest amount of homologs in a reasonable amount of time.">
            <option value="fast">Fast</option>
            <option value="mid-sensitive">Mid Sensitive</option>
            <option value="sensitive" selected="true">Sensitive</option>
            <option value="more-sensitive">More Sensitive</option>
            <option value="very-sensitive">Very Sensitive</option>
            <option value="ultra-sensitive">Ultra Sensitive</option>
        </param>
    </section>
</inputs>
<outputs>
    <!-- Each from_work_dir is the fixed name the command renames the corresponding result file to. -->
    <data name="gene_ages" format="tabular" from_work_dir="gene_ages.tsv" label="${tool.name} on ${on_string}: gene ages" />
    <data name="gene_ages_summary" format="tabular" from_work_dir="gene_ages_summary.tsv" label="${tool.name} on ${on_string}: gene ages summary" />
    <data name="founder_events" format="tabular" from_work_dir="founder_events.tsv" label="${tool.name} on ${on_string}: founder events" />
    <data name="founder_events_summary" format="tabular" from_work_dir="founder_events_summary.tsv" label="${tool.name} on ${on_string}: founder events summary" />
    <!-- HDF outputs exist only when a table of evolutionary distances was supplied. -->
    <data name="HDF_gene_ages" format="tabular" from_work_dir="HDF_gene_ages.tsv" label="${tool.name} on ${on_string}: HDF gene ages">
        <filter>evolutionary_distances</filter>
    </data>
    <data name="HDF_gene_ages_summary" format="tabular" from_work_dir="HDF_gene_ages_summary.tsv" label="${tool.name} on ${on_string}: HDF gene ages summary">
        <filter>evolutionary_distances</filter>
    </data>
    <data name="HDF_founder_events" format="tabular" from_work_dir="HDF_founder_events.tsv" label="${tool.name} on ${on_string}: HDF founder events">
        <filter>evolutionary_distances</filter>
    </data>
    <data name="HDF_founder_events_summary" format="tabular" from_work_dir="HDF_founder_events_summary.tsv" label="${tool.name} on ${on_string}: HDF founder events summary">
        <filter>evolutionary_distances</filter>
    </data>
</outputs>
<tests>
    <!--
        Built-in index run without a distances table. The optional
        evolutionary_distances input is left unset so the four HDF outputs
        stay filtered out, which is what expect_num_outputs="4" asserts.
        NOTE(review): FASTA_input, NCBI_taxonomy and NCBI_taxdump still carry
        empty placeholder values; point them at real test-data before this
        test can run.
    -->
    <test expect_num_outputs="4">
        <param name="FASTA_input" value=""/>
        <param name="NCBI_taxonomy" value=""/>
        <param name="NCBI_taxdump" value=""/>
        <section name="advanced_options">
            <param name="taxonomic_threshold" value="30"/>
            <param name="diamond_threshold" value="0.00001"/>
            <param name="matches_threshold" value="10"/>
            <param name="diamond_sensitivity" value="fast"/>
        </section>
        <conditional name="ref_db_source">
            <param name="db_source" value="indexed"/>
            <!-- "testDb" is the value column of test-data/diamond_database.loc. -->
            <param name="index" value="testDb"/>
        </conditional>
        <output name="gene_ages" file="test01_gene_ages.tab" ftype="tabular"/>
        <output name="gene_ages_summary" file="test01_gene_ages_summary.tab" ftype="tabular"/>
        <output name="founder_events" file="test01_founder_events.tab" ftype="tabular"/>
        <output name="founder_events_summary" file="test01_founder_events_summary.tab" ftype="tabular"/>
    </test>
</tests>
<help><![CDATA[
.. class:: infomark

**Purpose**

GenEra estimates gene-family founder events (i.e., the age of the last common ancestor of protein-coding gene families) through the reimplementation of genomic phylostratigraphy.

- GenEra takes advantage of DIAMOND speed and sensitivity to search for homolog genes throughout the entire NR database, and combines these results with the NCBI Taxonomy to assign an origination date for each gene and gene family in a query species.
- GenEra can also incorporate protein data from external sources to enrich the analysis, it can search for proteins within nucleotide data (i.e., genome/transcriptome assemblies) using MMseqs2 to improve the classification of orphan genes, and it calculates a taxonomic representativeness score to assess the reliability of assigning a gene to a specific age.
- Additionally, GenEra can calculate homology detection failure probabilities using abSENSE to help distinguish fast-evolving genes from high-confidence gene-family founder events.

----

.. class:: infomark

**Output files**

The tool always produces four tabular datasets:

- gene ages
- gene ages summary
- founder events
- founder events summary

When a table with pairwise evolutionary distances is supplied, four additional HDF (homology detection failure) datasets are produced:

- HDF gene ages
- HDF gene ages summary
- HDF founder events
- HDF founder events summary
]]></help>
<expand macro="citations"/>
Expand Down
2 changes: 1 addition & 1 deletion tools/genera/macros.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

<xml name="requirements">
    <requirements>
        <!--
            Single container entry. The image tag is the tool version with a
            leading "v"; the un-prefixed entry shown next to it in the diff
            was the line being replaced.
            NOTE(review): confirm the tag against the published image tags.
        -->
        <container type="docker">josuebarrera/genera:v@TOOL_VERSION@</container>
    </requirements>
</xml>
<xml name="citations">
Expand Down
Binary file added tools/genera/test-data/db.dmnd
Binary file not shown.
1 change: 1 addition & 0 deletions tools/genera/test-data/diamond_database.loc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
testDb test_index ${__HERE__}/db.dmnd
10 changes: 10 additions & 0 deletions tools/genera/tool-data/diamond_database.loc.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#This is a sample file that enables DIAMOND to find the protein databases
#You will need to create these data files and then create
#a diamond_database.loc file similar to this one (store it in this directory)
#that points to the directories in which those files are stored.
#The diamond_database.loc file has this format (longer white space characters are TAB characters):
#
#<unique_build_id> <display_name> <file_base_path>
#
#So, for example:
#ncbi_nr NCBI NR database (1-1-2015) /data/db/diamond/1-1-2015/nr.dmnd
8 changes: 8 additions & 0 deletions tools/genera/tool_data_table_conf.xml.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<!-- Sample data table configuration: registers the DIAMOND database table used by this tool. -->
<tables>
    <!-- Each row of the .loc file provides, in order: value, name, db_path. Lines starting with # are comments. -->
    <table comment_char="#" name="diamond_database">
        <columns>value, name, db_path</columns>
        <file path="tool-data/diamond_database.loc"/>
    </table>
</tables>
7 changes: 7 additions & 0 deletions tools/genera/tool_data_table_conf.xml.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<tables>
    <!-- Test-only data table: points diamond_database at the .loc file shipped in test-data. -->
    <table comment_char="#" name="diamond_database">
        <columns>value, name, db_path</columns>
        <file path="${__HERE__}/test-data/diamond_database.loc"/>
    </table>
</tables>

0 comments on commit f80c7c2

Please sign in to comment.