-
Notifications
You must be signed in to change notification settings - Fork 295
/
uniqify-sequences.py
executable file
·67 lines (52 loc) · 1.79 KB
/
uniqify-sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#! /usr/bin/env python2
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2013. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project@idyll.org
#
import khmer
import sys
import screed
K = 20
HASHTABLE_SIZE = int(2.5e8)
N_HT = 4
UNIQUE_LEN = 100
UNIQUE_F = 0.9
OUTPUT_WINDOW = 100
OUTPUT_OVERLAP = 10
def main():
kh = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
discarded = 0
kept_kmers = 0
total_kmers = 0
total_out = 0
for filename in sys.argv[1:]:
n_out = 0
for n, record in enumerate(screed.open(filename)):
if n > 0 and n % 10000 == 0:
print >>sys.stderr, '...', n, discarded
print >>sys.stderr, '==>', total_kmers, kept_kmers, int(
float(kept_kmers) / float(total_kmers) * 100.)
seq = record.sequence
seq = seq.replace('N', 'G')
paths = kh.extract_unique_paths(seq, UNIQUE_LEN, UNIQUE_F)
kh.consume(seq)
total_kmers += len(seq) - K + 1
if not len(paths):
discarded += 1
continue
for i, path in enumerate(paths):
n_out += 1
if len(path) < OUTPUT_WINDOW:
total_out += 1
print '>%d\n%s' % (total_out, path)
continue
for start in range(0, len(path) - OUTPUT_WINDOW + 1,
OUTPUT_OVERLAP):
total_out += 1
subpath = path[start:start + OUTPUT_WINDOW]
print '>%d\n%s' % (total_out, subpath)
print >>sys.stderr, '%d for %s' % (n_out, filename)
if __name__ == '__main__':
main()