-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcount.py
More file actions
100 lines (85 loc) · 3.32 KB
/
count.py
File metadata and controls
100 lines (85 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import gzip
import json
import random
import Bio
import Bio.SeqIO
from ctypes import CDLL, c_char_p
from CodonCountStruct import CodonCount
# convert ctypes structure into a python dictionary
def getdict(struct):
return dict(
(field, getattr(struct, field)) for field, _ in struct._fields_
)
# produce JSON from data object
def writecounts(data, pretty):
if pretty:
sort = True
ident = 2
else:
sort = False
ident = None
return json.dumps(data, sort_keys=sort, indent=ident)
# count codons and produce json containing the organism information & counts
# args.infile = path to a file to parse
# args.format = 'fasta' or 'genbank', the format of the file
# args.pretty = boolean, whether or not to pretty-print the resulting JSON
# args.output = file to output the JSON to
# args.shuffle = whether to also include shuffle data
def count(args):
counterc = CDLL('./lib/counterc.so')
counterc.countcodons.argtypes = (c_char_p,)
counterc.countcodons.restype = CodonCount
data = []
if args.gzip:
handle = gzip.open(args.infile)
else:
handle = args.infile
for seq_record in Bio.SeqIO.parse(handle, args.format):
# only bother producing output if there is actual sequence data;
# i.e., not all unknowns
if len(seq_record.seq) != seq_record.seq.count("N"):
# count the codons and put them in the data object along with
# other metadata
sequence = str(seq_record.seq).upper()
cstruct = counterc.countcodons(sequence)
# get taxonomy information
tax = ""
if 'taxonomy' in seq_record.annotations:
tax = '; '.join(seq_record.annotations['taxonomy'])
if args.shuffle:
random.seed()
l = list(sequence)
letter = l.pop(0)
l.append(letter)
random.shuffle(l)
shufflesequence = ''.join(l)
shufflestruct = \
counterc.countcodons(shufflesequence)
data += [{
# accession.version if we have a genbank file
# otherwise we should use the job uuid
"id": args.job if args.format ==
'fasta' else seq_record.id,
# taxonomy information
"taxonomy": tax,
"description": seq_record.description,
"codoncount": getdict(cstruct),
"shufflecodoncount": getdict(shufflestruct)
}]
else:
data += [{
# accession.version if we have a genbank file
# otherwise we should use the job uuid
"id": args.job if args.format == 'fasta' else
seq_record.id,
# taxonomy information
"taxonomy": tax,
"description": seq_record.description,
"codoncount": getdict(cstruct)
}]
# only write if data exists (i.e., we actually had sequence data
if data:
outputjson = writecounts(data, args.pretty)
if args.output:
args.output.write(outputjson)
return outputjson