-
Notifications
You must be signed in to change notification settings - Fork 0
/
qfilesplitterV3.1.py
163 lines (138 loc) · 4.39 KB
/
qfilesplitterV3.1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import time
import sys
# Module-level defaults. The command-line parsing at the bottom of the file
# overrides inputFile (-i), outputFile (-o) and blockSize (-b).
#inputFile = "porechop.fastq"
# Path of the file to split.
inputFile = "porechop.test.fastq"
# Base output path; create_output_folder() appends "output/" to it.
outputFile = "./"
# Minimum number of blocks an identifier needs before save_files() writes it.
# Holds the raw argument string after "-b" is parsed; save_files() applies int().
blockSize = 0
# Detected input format. Empty until identify_file_type() sets it to
# "fastq", "fasta" or "Error".
file_type = ""
def get_identifier(block, max_offset=50):
    """Extract the identifier from the header of a FASTQ record.

    The identifier is the text before the first ``' runid'`` occurrence,
    cut at its first underscore, with the first character removed.

    Args:
        block: Text of one record, starting with its header line.
        max_offset: Exclusive upper bound on the position at which
            ``' runid'`` may start. Defaults to 50, the limit that used to
            be hard-coded.

    Returns:
        The identifier string, or ``None`` when ``' runid'`` is missing,
        sits at position 0, or starts at ``max_offset`` or later.
    """
    runid_pos = block.find(' runid')
    if not 0 < runid_pos < max_offset:
        # Explicit instead of falling off the end of the function.
        return None
    # Keep the part before the first underscore, then drop the leading
    # marker character (the '@' of a FASTQ header).
    return block[:runid_pos].split('_')[0][1:]
def get_identifier_fasta(block):
    """Extract the identifier from the header of a FASTA record.

    The header is the first line of ``block`` with every ``';'`` removed.
    The identifier is the header text after its first character (the '>')
    up to the first space, or to the end of the header when there is no
    space, cut at its first underscore.

    Args:
        block: Text of one record, starting with its header line.

    Returns:
        The identifier string.
    """
    header = block.split('\n')[0].replace(';', '')
    space_pos = header.find(' ')
    if space_pos == -1:
        space_pos = len(header)
    # Slice the cleaned header, not the raw block: the index was computed on
    # the header after ';' removal, so applying it to the block shifted the
    # cut and left the ';' characters in the result.
    return header[1:space_pos].split('_')[0]
def create_output_folder():
    """Point ``outputFile`` at its ``output`` sub-directory and create it.

    Rebinds the module-level ``outputFile`` to ``<outputFile>/output/`` and
    creates that directory, including missing parents, when it does not
    exist yet.

    Raises:
        OSError: If the directory cannot be created.
    """
    global outputFile
    # Join with a path separator so that a base path given without a trailing
    # slash ("results") becomes "results/output/" rather than "resultsoutput/".
    # The empty last component keeps the trailing separator, which save_files
    # needs because it concatenates the file name directly.
    outputFile = os.path.join(outputFile, "output", "")
    if not os.path.isdir(outputFile):
        # makedirs also creates missing parent directories.
        os.makedirs(outputFile)
def test_block(block):
    """Validate one FASTQ record and report whether it is INVALID.

    A record is accepted when it has exactly four lines, its header contains
    "runid", "sampleid" and "read", and its second and fourth lines have the
    same length. On rejection the header and an error message are printed.

    Args:
        block: Text of one record, with or without a trailing newline.

    Returns:
        ``False`` when the record is valid, ``True`` when it is rejected.
    """
    # splitlines() ignores a trailing newline, so the last record of a file
    # that does not end with a newline is still seen as four lines.
    lines = block.splitlines()
    # An empty block has no header line; print an empty one instead of failing.
    header = lines[0] if lines else ""
    if len(lines) != 4:
        print(header)
        print("Error: Block with wrong format")
        return True
    if not ("runid" in header and "sampleid" in header and "read" in header):
        print(header)
        print("Error: Bad Identifier")
        return True
    if len(lines[1]) != len(lines[3]):
        print(header)
        print("Error: lengths doesn't match")
        return True
    return False
def process_block_fastaq(block):
    """Validate a FASTQ record and file it under its identifier.

    Appends ``block`` to ``identifiers[identifier]`` and increments
    ``identifiers_blockCount[identifier]``.

    Args:
        block: Text of one FASTQ record.

    Returns:
        ``"Error"`` when the record is rejected, otherwise ``None``.
    """
    if test_block(block):
        return "Error"
    identifier = get_identifier(block)
    if identifier is None:
        # test_block only checks that "runid" appears in the header, while
        # get_identifier can still fail to extract an identifier. Without this
        # guard the record would be stored under the key None.
        print(block.split('\n')[0])
        print("Error: Bad Identifier")
        return "Error"
    # Test membership on the dict itself: on Python 2, .keys() builds a list
    # and the lookup becomes a linear scan for every record.
    if identifier in identifiers:
        identifiers[identifier] += block
        identifiers_blockCount[identifier] += 1
    else:
        identifiers[identifier] = block
        identifiers_blockCount[identifier] = 1
def process_block_fasta(block):
    """File a FASTA record under its identifier.

    Appends ``block`` to ``identifiers[identifier]`` and increments
    ``identifiers_blockCount[identifier]``.

    Args:
        block: Text of one FASTA record, starting with its header line.
    """
    identifier = get_identifier_fasta(block)
    # Test membership on the dict itself: on Python 2, .keys() builds a list
    # and the lookup becomes a linear scan for every record.
    if identifier in identifiers:
        identifiers[identifier] += block
        identifiers_blockCount[identifier] += 1
    else:
        identifiers[identifier] = block
        identifiers_blockCount[identifier] = 1
def save_files():
    """Write one output file per identifier that has enough blocks.

    Creates the output folder, prints each identifier with its block count,
    and writes the collected text of every identifier whose count is at
    least ``int(blockSize)`` to ``<outputFile><identifier>.<file_type>``.

    Raises:
        ValueError: If ``blockSize`` cannot be converted to an integer.
    """
    create_output_folder()
    # Convert the cutoff once instead of on every iteration.
    min_blocks = int(blockSize)
    for identifier, content in identifiers.items():
        block_count = identifiers_blockCount[identifier]
        # Function-call form prints the same "<id> <count>" text on Python 2
        # and is also valid syntax on Python 3.
        print("{} {}".format(identifier, block_count))
        if block_count >= min_blocks:
            out_path = "{1}{0}.{2}".format(identifier, outputFile, file_type)
            with open(out_path, 'w') as out_file:
                out_file.write(content)
def split_operation():
    """Read ``inputFile``, group its records by identifier and save them.

    The file is read line by line. The format is detected from the first
    line by identify_file_type(). Lines are accumulated into one record
    until the next header line, and each completed record is passed to the
    processing function for the detected format. After the last line, the
    pending record is processed, the line count is printed and save_files()
    is called.
    """
    pending = ""
    line_count = 0
    with open(inputFile) as handle:
        for line in handle:
            line_count += 1
            identify_file_type(line)
            if file_type == "fastq":
                # Quality lines may start with '@' too, so a header must
                # also mention "runid".
                starts_record = line[0] == '@' and "runid" in line
                flush = process_block_fastaq
            elif file_type == "fasta":
                starts_record = line[0] == '>'
                flush = process_block_fasta
            else:
                # Unrecognised format: the line is counted but not collected.
                continue
            if not starts_record:
                pending += line
                continue
            if pending != "":
                flush(pending)
            pending = line
    # The final record has no following header to trigger its processing.
    if file_type == "fastq":
        process_block_fastaq(pending)
    elif file_type == "fasta":
        process_block_fasta(pending)
    print("Lines found:{}".format(line_count))
    save_files()
def identify_file_type(line):
    """Set the module-level ``file_type`` from ``line`` if it is still unset.

    A line starting with '@' and containing "runid" selects "fastq"; a line
    starting with '>' selects "fasta"; anything else selects "Error". The
    chosen value is printed. Calls made after the type has been set do
    nothing.

    Args:
        line: A line of the input file; must not be empty.
    """
    global file_type
    if file_type != "":
        # Already decided by an earlier call.
        return
    first_char = line[0]
    if first_char == '@' and "runid" in line:
        detected = "fastq"
    elif first_char == '>':
        detected = "fasta"
    else:
        detected = "Error"
    file_type = detected
    print("File type:{}".format(file_type))
# Script entry point: parse the command line, run the split and report the
# elapsed time.
start_time = time.time()
# Collected record text per identifier, filled by the process_block_* functions.
identifiers = {}
# Number of records collected per identifier.
identifiers_blockCount = {}
if len(sys.argv) > 1:
    cli_args = sys.argv[1:]
    for position, flag in enumerate(cli_args):
        if flag not in ("-i", "-o", "-b"):
            continue
        if position + 1 >= len(cli_args):
            # A flag given as the last argument used to raise IndexError.
            sys.exit("Error: option {} requires a value".format(flag))
        value = cli_args[position + 1]
        if flag == "-i":
            inputFile = value
        elif flag == "-o":
            outputFile = value
        else:
            # Kept as the raw string; save_files() converts it with int().
            blockSize = value
    split_operation()
else:
    # Function-call form prints the same text on Python 2 and is also valid
    # syntax on Python 3.
    print("""
No arguments given
python qfilespliter.py [Arguments]
Arguments:
-i input file
-o output path
-b blocks size cutoff [optional]
""")
print("--- %s seconds ---" % (time.time() - start_time))