-
Notifications
You must be signed in to change notification settings - Fork 295
/
count-within-radius.py
executable file
·60 lines (44 loc) · 1.32 KB
/
count-within-radius.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#! /usr/bin/env python2
#
# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
# Copyright (C) Michigan State University, 2009-2013. It is licensed under
# the three-clause BSD license; see doc/LICENSE.txt.
# Contact: khmer-project@idyll.org
#
import sys
import screed.fasta
import os
import khmer
K = 32
HASHTABLE_SIZE = int(8e9)
N_HT = 4
RADIUS = 100
###
MAX_DENSITY = 2000
def main():
infile = sys.argv[1]
outfile = sys.argv[2]
if len(sys.argv) > 3:
RADIUS = int(sys.argv[3])
print 'saving to:', outfile
print 'making hashtable'
ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
print 'eating', infile
ht.consume_fasta(infile)
print 'loading'
ht.save(outfile + '.ht')
outfp = open(outfile, 'w')
for n, record in enumerate(screed.open(infile)):
if n % 10000 == 0:
print '... saving', n
seq = record['sequence']
for pos in range(0, len(seq), 200):
subseq = seq[pos:pos + 200]
middle = (len(subseq) - K + 1) / 2
density = ht.count_kmers_within_radius(
subseq[middle:middle + K], RADIUS,
MAX_DENSITY)
density /= float(RADIUS)
print >>outfp, '>%s d=%.3f\n%s' % (record['name'], density, subseq)
if __name__ == '__main__':
main()