Skip to content

Commit 0c1c0e2

Browse files
authored
Merge pull request #7510 from d-callan/trim-stops
add trim terminal stops tool
2 parents 7352c3e + 927b2ff commit 0c1c0e2

File tree

8 files changed

+315
-0
lines changed

8 files changed

+315
-0
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
name: remove_terminal_stop_codons
2+
owner: iuc
3+
categories:
4+
- Phylogenetics
5+
- Sequence Analysis
6+
- Fasta Manipulation
7+
description: Remove terminal stop codons from coding sequences
8+
homepage_url: https://github.com/veg/CAPHEINE
9+
long_description: |
10+
Remove terminal stop codons from coding sequences in a FASTA file.
11+
This tool is designed as a preprocessing step for downstream analysis tools
12+
like cawlign and HyPhy that do not permit stop codons in their input sequences.
13+
It uses NCBI genetic codes to correctly identify stop codons for different organisms.
14+
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/remove_terminal_stop_codons
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Remove terminal stop codons from coding sequences.
4+
5+
Trim all terminal stop codons from sequences in a FASTA file, using a chosen
6+
NCBI genetic code (translation table). Leave non-stop terminal codons alone.
7+
If any INTERNAL, in-frame stop codon is found, exit with an error unless --no-check-internal is given.
8+
9+
This tool is designed as a preprocessing step for tools like cawlign and HyPhy
10+
that do not permit internal stop codons in their input sequences.
11+
12+
Requires: Biopython
13+
"""
14+
15+
import argparse
16+
import sys
17+
18+
from Bio import SeqIO
19+
from Bio.Data import CodonTable
20+
from Bio.Seq import Seq
21+
22+
23+
def load_table(table_arg):
    """Resolve *table_arg* to a Biopython unambiguous DNA codon table.

    ``None`` selects the Standard code (NCBI table 1). Any other value is
    first interpreted as a numeric NCBI table id and, if that fails, as a
    table name. When neither lookup succeeds, a message listing the valid
    names and ids is written to stderr and the process exits with status 2.
    """
    tables_by_id = CodonTable.unambiguous_dna_by_id
    tables_by_name = CodonTable.unambiguous_dna_by_name

    if table_arg is None:
        return tables_by_id[1]  # Standard

    # First attempt: numeric NCBI table id. A non-numeric value or an
    # unregistered id simply falls through to the name lookup below.
    try:
        return tables_by_id[int(table_arg)]
    except (ValueError, KeyError):
        pass

    # Second attempt: table name.
    if table_arg in tables_by_name:
        return tables_by_name[table_arg]

    # Nothing matched: tell the user what would have been accepted.
    known_names = ", ".join(sorted(tables_by_name))
    known_ids = ", ".join(str(table_id) for table_id in sorted(tables_by_id))
    sys.stderr.write(
        f"ERROR: Unknown genetic code '{table_arg}'.\n"
        "Try an NCBI table id (e.g., 1) or one of these names:\n"
        f" {known_names}\n"
        f"(Valid ids include: {known_ids})\n"
    )
    sys.exit(2)
47+
48+
49+
def trim_terminal_stops_and_validate(record, stop_codons, check_internal=True):
    """
    Remove ALL trailing in-frame stop codons (0+ at the end) from a record.

    The reading frame is assumed to start at the first nucleotide. Trailing
    stops are only recognised when the sequence ends on a codon boundary
    (length divisible by three). If the sequence ends in a partial codon,
    nothing is trimmed, so an out-of-frame triplet that merely spells a stop
    is never cut out of the sequence.

    Parameters:
        record: object exposing ``.seq`` (the nucleotide sequence) and
            ``.id`` (used in the error message).
        stop_codons: container of upper-case DNA stop codons, e.g.
            ``{"TAA", "TAG", "TGA"}``.
        check_internal: when True, any in-frame stop codon before the
            trailing stop block aborts the program.

    Returns:
        A ``Seq`` holding the sequence without its trailing stop codons.
        The sequence is upper-cased and any RNA ``U`` is converted to ``T``,
        even when nothing was trimmed.

    Exits:
        Calls ``sys.exit(2)`` after writing to stderr if ``check_internal``
        is True and an internal in-frame stop codon is found. The reported
        position is the 0-based nucleotide offset of the codon's first base.
    """
    # Work with DNA letters; treat any RNA U as T
    seq_str = str(record.seq).upper().replace("U", "T")

    # Walk backwards over complete codons while they are stops. Only do this
    # when the sequence ends on a codon boundary: otherwise the last three
    # letters straddle two codons and removing them would corrupt the frame.
    idx = len(seq_str)
    if idx % 3 == 0:
        while idx >= 3 and seq_str[idx - 3:idx] in stop_codons:
            idx -= 3

    # Scan for INTERNAL stops: all complete codons up to (but not including)
    # the trailing stop block, ignoring any trailing partial codon.
    if check_internal:
        scan_end = idx - (idx % 3)  # only complete codons
        for pos in range(0, scan_end, 3):
            if seq_str[pos:pos + 3] in stop_codons:
                sys.stderr.write(
                    f"ERROR: Found an internal stop codon in sequence "
                    f"'{record.id}' at position {pos}.\n"
                    f"Tools like HyPhy and cawlign do not permit internal "
                    f"stop codons. Please review your input sequences.\n"
                )
                sys.exit(2)

    # idx still equals len(seq_str) when no trailing stop was found, so the
    # slice leaves sequences with a non-stop terminal codon untouched.
    return Seq(seq_str[:idx])
93+
94+
95+
def main():
    """Command-line entry point.

    Parses the options, resolves the genetic code, trims the terminal stop
    codons of every record in the input FASTA and writes all records to the
    output FASTA in one go after every record has been processed.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Remove all terminal stop codons from a FASTA, using a chosen "
            "genetic code. Optionally fail if any internal in-frame stop "
            "codon is present."
        )
    )
    parser.add_argument("-i", "--input", required=True, help="Input FASTA file")
    parser.add_argument("-o", "--output", required=True, help="Output FASTA file")
    parser.add_argument(
        "-t",
        "--table",
        help=(
            "NCBI translation table id (e.g., 1) or name (e.g., "
            "'Vertebrate Mitochondrial'). Default: 1 (Standard)."
        ),
    )
    parser.add_argument(
        "--no-check-internal",
        action="store_true",
        help="Do not check for internal stop codons (only remove terminal).",
    )
    options = parser.parse_args()

    # e.g., {'TAA','TAG','TGA'} for the Standard code
    stops = set(load_table(options.table).stop_codons)
    validate_internal = not options.no_check_internal

    # Records are collected first so that the output file is only written
    # once every sequence has passed validation.
    processed = []
    for record in SeqIO.parse(options.input, "fasta"):
        record.seq = trim_terminal_stops_and_validate(
            record, stops, validate_internal
        )
        processed.append(record)

    SeqIO.write(processed, options.output, "fasta")
127+
128+
129+
if __name__ == "__main__":
    # Run the command-line interface only when executed as a script,
    # not when the module is imported.
    main()
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
<tool id="remove_terminal_stop_codons" name="Remove terminal stop codons" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2+
<description>from coding sequences</description>
3+
<macros>
4+
<token name="@TOOL_VERSION@">1.0.0</token>
5+
<token name="@VERSION_SUFFIX@">0</token>
6+
<token name="@PROFILE@">25.0</token>
7+
</macros>
8+
<requirements>
9+
<requirement type="package" version="1.84">biopython</requirement>
10+
</requirements>
11+
<required_files>
12+
<include path="remove_terminal_stop_codons.py" />
13+
</required_files>
14+
<command detect_errors="exit_code"><![CDATA[
15+
python '$__tool_directory__/remove_terminal_stop_codons.py'
16+
-i '$input'
17+
-o '$output'
18+
#if str($genetic_code) != "1"
19+
-t '$genetic_code'
20+
#end if
21+
$no_check_internal
22+
]]></command>
23+
<inputs>
24+
<param name="input" type="data" format="fasta" label="Input FASTA file"
25+
help="FASTA file containing coding sequences (CDS) to process." />
26+
<param name="genetic_code" type="select" label="Genetic code"
27+
help="NCBI translation table to use for identifying stop codons. Different organisms use different genetic codes.">
28+
<option value="1" selected="true">1 - Standard</option>
29+
<option value="2">2 - Vertebrate Mitochondrial</option>
30+
<option value="3">3 - Yeast Mitochondrial</option>
31+
<option value="4">4 - Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
32+
<option value="5">5 - Invertebrate Mitochondrial</option>
33+
<option value="6">6 - Ciliate, Dasycladacean and Hexamita Nuclear</option>
34+
<option value="9">9 - Echinoderm and Flatworm Mitochondrial</option>
35+
<option value="10">10 - Euplotid Nuclear</option>
36+
<option value="11">11 - Bacterial, Archaeal and Plant Plastid</option>
37+
<option value="12">12 - Alternative Yeast Nuclear</option>
38+
<option value="13">13 - Ascidian Mitochondrial</option>
39+
<option value="14">14 - Alternative Flatworm Mitochondrial</option>
40+
<option value="15">15 - Blepharisma Nuclear</option>
41+
<option value="16">16 - Chlorophycean Mitochondrial</option>
42+
<option value="21">21 - Trematode Mitochondrial</option>
43+
<option value="22">22 - Scenedesmus obliquus Mitochondrial</option>
44+
<option value="23">23 - Thraustochytrium Mitochondrial</option>
45+
<option value="24">24 - Rhabdopleuridae Mitochondrial</option>
46+
<option value="25">25 - Candidate Division SR1 and Gracilibacteria</option>
47+
<option value="26">26 - Pachysolen tannophilus Nuclear</option>
48+
<option value="27">27 - Karyorelict Nuclear</option>
49+
<option value="28">28 - Condylostoma Nuclear</option>
50+
<option value="29">29 - Mesodinium Nuclear</option>
51+
<option value="30">30 - Peritrich Nuclear</option>
52+
<option value="31">31 - Blastocrithidia Nuclear</option>
53+
<option value="33">33 - Cephalodiscidae Mitochondrial UAA-Tyr</option>
54+
</param>
55+
<param argument="--no-check-internal" type="boolean" truevalue="--no-check-internal" falsevalue=""
56+
checked="false" label="Skip internal stop codon check"
57+
help="By default, the tool will fail if internal (in-frame) stop codons are found. Enable this to only remove terminal stops without checking for internal ones." />
58+
</inputs>
59+
<outputs>
60+
<data name="output" format="fasta" label="${tool.name} on ${on_string}" />
61+
</outputs>
62+
<tests>
63+
<!-- Test 1: Basic removal of terminal stop codon -->
64+
<test expect_num_outputs="1">
65+
<param name="input" value="with_terminal_stop.fasta" ftype="fasta" />
66+
<param name="genetic_code" value="1" />
67+
<output name="output" file="without_terminal_stop.fasta" ftype="fasta" />
68+
</test>
69+
<!-- Test 2: Sequence without terminal stop (should pass through unchanged) -->
70+
<test expect_num_outputs="1">
71+
<param name="input" value="no_terminal_stop.fasta" ftype="fasta" />
72+
<param name="genetic_code" value="1" />
73+
<output name="output" file="no_terminal_stop.fasta" ftype="fasta" />
74+
</test>
75+
<!-- Test 3: Internal stop codon should fail -->
76+
<test expect_failure="true">
77+
<param name="input" value="with_internal_stop.fasta" ftype="fasta" />
78+
<param name="genetic_code" value="1" />
79+
</test>
80+
<!-- Test 4: Internal stop with skip check should pass -->
81+
<test expect_num_outputs="1">
82+
<param name="input" value="with_internal_stop.fasta" ftype="fasta" />
83+
<param name="genetic_code" value="1" />
84+
<param name="no_check_internal" value="true" />
85+
<output name="output" file="with_internal_stop_output.fasta" ftype="fasta" />
86+
</test>
87+
</tests>
88+
<help><![CDATA[
89+
**What it does**
90+
91+
This tool removes terminal (trailing) stop codons from coding sequences in a FASTA file.
92+
It is designed as a **preprocessing step** for downstream tools like **cawlign** and **HyPhy**
93+
that do not permit stop codons in their input sequences.
94+
95+
**Important**: By default, this tool will **fail with an error** if it detects any internal
96+
(in-frame) stop codons in your sequences. This is intentional, but can be disabled with the
97+
**Skip internal stop codon check** option (`--no-check-internal` on the command line).
98+
99+
----
100+
101+
**Input**
102+
103+
A FASTA file containing coding sequences (CDS). Sequences should be:
104+
105+
- In the correct reading frame (starting at position 1 of a codon)
106+
- DNA sequences (RNA sequences with U will be converted to T)
107+
108+
----
109+
110+
**Output**
111+
112+
A FASTA file with terminal stop codons removed. The output preserves:
113+
114+
- Sequence identifiers and descriptions
115+
- Sequences that don't end with stop codons (passed through unchanged)
116+
- Partial codons at the end (not removed)
117+
118+
----
119+
120+
**Genetic Codes**
121+
122+
Different organisms use different genetic codes (translation tables) which define
123+
which codons are stop codons:
124+
125+
- **Standard (1)**: TAA, TAG, TGA - used by most organisms
126+
- **Vertebrate Mitochondrial (2)**: TAA, TAG, AGA, AGG - mitochondria of vertebrates
127+
- **Bacterial, Archaeal and Plant Plastid (11)**: TAA, TAG, TGA - bacteria, archaea and plant plastids
128+
129+
Select the appropriate genetic code for your organism to ensure correct stop codon identification.
130+
131+
----
132+
133+
**Use Cases**
134+
135+
1. **Before cawlign**: Remove terminal stops from sequences before codon-aware alignment
136+
2. **Before HyPhy**: Prepare sequences for selection analysis (HyPhy methods like BUSTED, FEL, MEME)
137+
3. **Quality control**: Identify sequences with internal stop codons that may need review
138+
]]></help>
139+
<citations>
140+
<citation type="bibtex">
141+
@misc{capheine2025,
142+
author = {Callan, Danielle and Verdonk, Hannah and Kosakovsky Pond, Sergei L.},
143+
title = {CAPHEINE: A Comprehensive Automated Pipeline Using HyPhy for Evolutionary Inference with Nextflow},
144+
year = {2025},
145+
publisher = {GitHub},
146+
url = {https://github.com/veg/CAPHEINE},
147+
note = {Terminal stop-codon removal logic in this Galaxy tool is adapted from the CAPHEINE pipeline.}
148+
}
149+
</citation>
150+
</citations>
151+
</tool>
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>seq1 sequence without terminal stop
2+
ATGAAACCCGGGAAA
3+
>seq2 another sequence without terminal stop
4+
ATGCCCAAAGGGCCC
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>seq1 sequence with internal TAA stop codon
2+
ATGTAACCCGGGTAA
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
>seq1 sequence with internal TAA stop codon
2+
ATGTAACCCGGG
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
>seq1 test sequence with terminal TAA stop
2+
ATGAAACCCGGGTAA
3+
>seq2 test sequence with terminal TAG stop
4+
ATGCCCAAAGGGCCCAAATAG
5+
>seq3 test sequence with terminal TGA stop
6+
ATGGGGTTTAAACCCGGGTGA
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
>seq1 test sequence with terminal TAA stop
2+
ATGAAACCCGGG
3+
>seq2 test sequence with terminal TAG stop
4+
ATGCCCAAAGGGCCCAAA
5+
>seq3 test sequence with terminal TGA stop
6+
ATGGGGTTTAAACCCGGG

0 commit comments

Comments
 (0)