codonpdx-python/codonpdx/count.py at master · PDX-Flamingo/codonpdx-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import gzip
import json
import random
import Bio
import Bio.SeqIO

from ctypes import CDLL, c_char_p
from CodonCountStruct import CodonCount


# convert ctypes structure into a python dictionary
def getdict(struct):
    return dict(
        (field, getattr(struct, field)) for field, _ in struct._fields_
        )


# produce JSON from data object
def writecounts(data, pretty):
    if pretty:
        sort = True
        ident = 2
    else:
        sort = False
        ident = None
    return json.dumps(data, sort_keys=sort, indent=ident)


# count codons and produce json containing the organism information & counts
# args.infile = path to a file to parse
# args.format = 'fasta' or 'genbank', the format of the file
# args.pretty = boolean, whether or not to pretty-print the resulting JSON
# args.output = file to output the JSON to
# args.shuffle = whether to also include shuffle data

def count(args):
    counterc = CDLL('./lib/counterc.so')
    counterc.countcodons.argtypes = (c_char_p,)
    counterc.countcodons.restype = CodonCount

    data = []

    if args.gzip:
        handle = gzip.open(args.infile)
    else:
        handle = args.infile

    for seq_record in Bio.SeqIO.parse(handle, args.format):
        # only bother producing output if there is actual sequence data;
        # i.e., not all unknowns
        if len(seq_record.seq) != seq_record.seq.count("N"):
            # count the codons and put them in the data object along with
            # other metadata

            sequence = str(seq_record.seq).upper()
            cstruct = counterc.countcodons(sequence)

            # get taxonomy information
            tax = ""
            if 'taxonomy' in seq_record.annotations:
                tax = '; '.join(seq_record.annotations['taxonomy'])

            if args.shuffle:
                random.seed()
                l = list(sequence)
                letter = l.pop(0)
                l.append(letter)
                random.shuffle(l)
                shufflesequence = ''.join(l)
                shufflestruct = \
                    counterc.countcodons(shufflesequence)
                data += [{
                    # accession.version if we have a genbank file
                    # otherwise we should use the job uuid
                    "id": args.job if args.format ==
                    'fasta' else seq_record.id,
                    # taxonomy information
                    "taxonomy": tax,
                    "description": seq_record.description,
                    "codoncount": getdict(cstruct),
                    "shufflecodoncount": getdict(shufflestruct)
                }]
            else:
                data += [{
                         # accession.version if we have a genbank file
                         # otherwise we should use the job uuid
                         "id": args.job if args.format == 'fasta' else
                         seq_record.id,
                         # taxonomy information
                         "taxonomy": tax,
                         "description": seq_record.description,
                         "codoncount": getdict(cstruct)
                         }]

    # only write if data exists (i.e., we actually had sequence data
    if data:
        outputjson = writecounts(data, args.pretty)
        if args.output:
            args.output.write(outputjson)
        return outputjson