Skip to content

Commit 7740d8d

Browse files
author
Jon Palmer
committed
min-contig for train #24; fix mem ab initio limit #22; --skip-predictors #22; bump version
1 parent 4bfec78 commit 7740d8d

File tree

6 files changed

+470
-111
lines changed

6 files changed

+470
-111
lines changed

CITATION.cff

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cff-version: version = "25.7.16"
1+
cff-version: version = "25.7.19"
22
title: 'funannotate2: eukaryotic genome annotation'
33
message: >-
44
If you use this software, please cite it using the
@@ -17,5 +17,5 @@ keywords:
1717
- functional annotation
1818
- consensus gene models
1919
license: BSD-2-Clause
20-
version: version = "25.7.16"
21-
date-released: '2025-07-16'
20+
version: version = "25.7.19"
21+
date-released: '2025-07-19'

funannotate2/__main__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,13 @@ def predict_subparser(subparsers):
286286
help="Memory limit in GB to adjust CPU allocation",
287287
metavar="",
288288
)
289+
optional_args.add_argument(
290+
"--skip-predictors",
291+
nargs="+",
292+
choices=["snap", "augustus", "glimmerhmm", "genemark"],
293+
help="Skip specific ab initio predictors (choices: snap, augustus, glimmerhmm, genemark)",
294+
metavar="",
295+
)
289296
other_args = group.add_argument_group("Other arguments")
290297
other_args.add_argument(
291298
"-h",
@@ -349,6 +356,14 @@ def train_subparser(subparsers):
349356
help="Max length for fasta headers",
350357
metavar="",
351358
)
359+
optional_args.add_argument(
360+
"--min-contig-length",
361+
default=10000,
362+
dest="min_contig_length",
363+
type=int,
364+
help="Minimum contig length to use for training (default: 10000)",
365+
metavar="",
366+
)
352367
optional_args.add_argument(
353368
"--busco-lineage",
354369
dest="busco_lineage",

funannotate2/fastx.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,54 @@ def simplify_headers_drop(inputfile, keepfile, dropfile, base="contig_", drop=[]
259259
return names
260260

261261

262+
def filter_contigs_by_length(inputfile, outputfile, min_length=10000, base="contig_"):
    """
    Write only the sufficiently long contigs of a FASTA file, renaming them.

    Records are streamed from ``inputfile`` with pyfastx (no index is built).
    Every record whose sequence length is at least ``min_length`` is written to
    ``outputfile`` under a new header made of ``base`` followed by a running
    counter of kept records (starting at 1); the sequence is passed through
    ``softwrap`` before writing. Shorter records are only tallied, not written.
    ``outputfile`` is opened for writing up front, so it is created (and
    truncated) even when no record passes the filter.

    Args:
        inputfile (str): Path to the input FASTA file.
        outputfile (str): Path of the FASTA file to write kept contigs to.
        min_length (int, optional): Smallest sequence length that is kept
            (inclusive). Defaults to 10000.
        base (str, optional): Prefix for the new headers. Defaults to "contig_".

    Returns:
        tuple: Five values, in this order:
            - dict: New header -> name of the record as yielded by pyfastx,
              for kept contigs only.
            - int: Number of contigs kept.
            - int: Number of contigs filtered out.
            - int: Summed length of kept contigs.
            - int: Summed length of filtered contigs.
    """
    header_map = {}
    n_kept = n_dropped = 0
    bp_kept = bp_dropped = 0

    with open(outputfile, "w") as handle:
        for original_name, sequence in pyfastx.Fasta(inputfile, build_index=False):
            size = len(sequence)
            long_enough = size >= min_length

            if not long_enough:
                # Too short: count it and move on without writing anything.
                n_dropped += 1
                bp_dropped += size
                continue

            # The counter is bumped before naming, so kept contigs are
            # numbered consecutively from 1 regardless of skipped records.
            n_kept += 1
            bp_kept += size
            new_name = f"{base}{n_kept}"
            header_map[new_name] = original_name
            handle.write(f">{new_name}\n{softwrap(sequence)}\n")

    return header_map, n_kept, n_dropped, bp_kept, bp_dropped
308+
309+
262310
def list2groups(L):
263311
"""
264312
Identify groups of continuous numbers in a list.

0 commit comments

Comments
 (0)