@@ -259,6 +259,54 @@ def simplify_headers_drop(inputfile, keepfile, dropfile, base="contig_", drop=[]
259
259
return names
260
260
261
261
262
def filter_contigs_by_length(inputfile, outputfile, min_length=10000, base="contig_"):
    """
    Filter contigs by minimum length and simplify headers.

    This function reads a FASTA file, keeps only the contigs whose sequence
    length is at least ``min_length`` (the threshold is inclusive), and writes
    the kept contigs to ``outputfile`` under simplified headers. This is
    particularly useful for training ab initio gene predictors where short
    contigs are not informative and can hurt training quality.

    Simplified headers are built as ``base`` immediately followed by a
    1-based running number of *kept* contigs (e.g. ``contig_1``), with no
    embedded whitespace, so numbering stays contiguous regardless of how
    many input records were dropped.

    Args:
        inputfile (str): Path to the input FASTA file.
        outputfile (str): Path to the output file for filtered contigs.
            The file is opened in write mode, so existing content is replaced.
        min_length (int, optional): Minimum contig length to keep. Defaults to 10000.
        base (str, optional): Base string for simplified headers. Defaults to "contig_".

    Returns:
        tuple: A tuple containing:
            - dict: Mapping of simplified headers to original headers for kept contigs.
            - int: Number of contigs kept.
            - int: Number of contigs filtered out.
            - int: Total length of kept contigs.
            - int: Total length of filtered contigs.
    """
    names = {}
    kept_count = 0
    filtered_count = 0
    kept_length = 0
    filtered_length = 0

    with open(outputfile, "w") as outfile:
        # build_index=False streams records as (title, sequence) pairs
        # instead of creating an on-disk index next to the input file.
        for title, seq in pyfastx.Fasta(inputfile, build_index=False):
            seq_length = len(seq)

            if seq_length < min_length:
                # Too short: only account for it in the summary statistics.
                filtered_count += 1
                filtered_length += seq_length
                continue

            kept_count += 1
            kept_length += seq_length
            # No whitespace inside the header: many FASTA consumers treat
            # everything up to the first blank as the record ID, so a space
            # here would collapse every contig onto the same ID.
            simplified_name = f"{base}{kept_count}"
            names[simplified_name] = title
            # softwrap is defined elsewhere in this module; presumably it
            # folds the sequence into fixed-width lines -- confirm there.
            outfile.write(f">{simplified_name}\n{softwrap(seq)}\n")

    return names, kept_count, filtered_count, kept_length, filtered_length
308
+
309
+
262
310
def list2groups (L ):
263
311
"""
264
312
Identify groups of continuous numbers in a list.
0 commit comments