-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathexpandSeq.py
executable file
·166 lines (148 loc) · 4.03 KB
/
expandSeq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/python
import re
import sys
import os
import getopt
from itertools import product
def main():
params = parseArgs()
if params.fasta:
file_object = open("out.fasta", "w")
for seq in read_fasta(params.fasta):
count = 1
for i in expandAmbiquousDNA(seq[1]):
header = ">" + str(seq[0]) + "." + str(count)+ "\n"
sequence = str(i) + "\n"
file_object.write(header)
file_object.write(sequence)
count += 1
file_object.close()
elif params.seq:
for i in expandAmbiquousDNA(params.seq):
print(i)
else:
sys.exit("No input provided.")
#Object to parse command-line arguments
class parseArgs():
def __init__(self):
#Define options
try:
options, remainder = getopt.getopt(sys.argv[1:], 's:f:h', \
["seq=","fasta=","help"])
except getopt.GetoptError as err:
print(err)
self.display_help("\nExiting because getopt returned non-zero exit status.")
#Default values for params
#Input params
self.seq=None
self.fasta=None
#First pass to see if help menu was called
for o, a in options:
if o in ("-h", "-help", "--help"):
self.display_help("Exiting because help menu was called.")
#Second pass to set all args.
for opt, arg_raw in options:
arg = arg_raw.replace(" ","")
arg = arg.strip()
opt = opt.replace("-","")
#print(opt,arg)
if opt in ('s', 'seq'):
self.seq = arg
elif opt in ('h', 'help'):
pass
elif opt in ('f','fasta'):
self.fasta = arg
else:
assert False, "Unhandled option %r"%opt
#Check manditory options are set
if self.seq and self.fasta:
sys.exit("Error: Input either -s, or -f. Not both.")
if not self.seq and not self.fasta:
sys.exit("Error: Input either -s, or -f.")
def display_help(self, message=None):
if message is not None:
print()
print (message)
print ("\nexpandSeq.py\n")
print ("Contact:Tyler K. Chafin, University of Arkansas,tkchafin@uark.edu")
print ("\nUsage: ", sys.argv[0], "-s AGTGATAGTAGTGRRTGAYAGAGT \n")
print ("Description: expandSeq.py expands DNA sequences with ambiguities to a list of all possible variants.")
print("""
Input options:
-s,--seq : Sequence string to expand (results output to stdout)
or
-f,--fasta : You can also specify a FASTA file. Results will be output as FASTA.
-h,--help : Displays help menu""")
print()
sys.exit()
#Function to split character to IUPAC codes, assuing diploidy
def get_iupac_caseless(char):
lower = False
if char.islower():
lower = True
char = char.upper()
iupac = {
"A" : ["A"],
"G" : ["G"],
"C" : ["C"],
"T" : ["T"],
"N" : ["A", "C", "G", "T"],
"-" : ["-"],
"R" : ["A","G"],
"Y" : ["C","T"],
"S" : ["G","C"],
"W" : ["A","T"],
"K" : ["G","T"],
"M" : ["A","C"],
"B" : ["C","G","T"],
"D" : ["A","G","T"],
"H" : ["A","C","T"],
"V" : ["A","C","G"]
}
ret = iupac[char]
if lower:
ret = [c.lower() for c in ret]
return ret
#Read genome as FASTA. FASTA header will be used
#This is a generator function
#Doesn't matter if sequences are interleaved or not.
def read_fasta(fas):
if not fileCheck(fas):
raise FileNotFoundError("Fatal exception, file %s not found."%fas)
fh = open(fas)
try:
with fh as file_object:
contig = ""
seq = ""
for line in file_object:
line = line.strip()
if not line:
continue
line = line.replace(" ","")
#print(line)
if line[0] == ">": #Found a header line
#If we already loaded a contig, yield that contig and
#start loading a new one
if contig:
yield([contig,seq]) #yield
contig = "" #reset contig and seq
seq = ""
contig = (line.replace(">",""))
else:
seq += line
#Iyield last sequence, if it has both a header and sequence
if contig and seq:
yield([contig,seq])
finally:
fh.close()
#Function to check if a file path is valid
def fileCheck(f):
return (os.path.isfile(f))
#Function to expand ambiguous sequences
#Generator function
def expandAmbiquousDNA(sequence):
for i in product(*[get_iupac_caseless(j) for j in sequence]):
yield("".join(i))
#Call main function
if __name__ == '__main__':
main()