Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 22 additions & 30 deletions grepfqparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,24 @@
def main():
#parse command line options
try:
opts, arg = getopt.getopt(sys.argv[1:],"h", ["help"])
opts, arg = getopt.getopt(sys.argv[1:],"ht:", ["help"])
except getopt.error, msg:
print msg
print "for help use --help"
sys.exit(2)
# process options
offset = 0
for o, a in opts:
if o == '-t':
offset = int(a)
print "using offset", offset
if o in ("-h", "--help"):
print __doc__
sys.exit(0)
if len(arg) < 3:
print "\nUsage: python grepfqparser.py <input_fastq> <barcode_file> <output_folder>\m"
print "\nUsage: python grepfqparser.py [options] <input_fastq> <barcode_file> <output_folder>\m"
sys.exit(0)
#process arguments

fqFile = arg[0]
bcFile = arg[1]
OutFolder = arg[2]
Expand All @@ -83,7 +86,7 @@ def main():
else:
gzbool = "NO"

print "fq file gzipped? = %s" %(gzbool)
print "fq file gzipped? = %s" %(gzbool)
if gzbool == "YES":
print "unzipping file"
tempfq = open("tempfq",'w')
Expand All @@ -101,31 +104,18 @@ def main():
barcode_up = barcode.upper()
name = lineItems[1]
print barcode

parsed_file_step1_name = str(OutFolder + "/indiv" + name + "_" + barcode + "firstgrep")
parsed_file_step1 = open(parsed_file_step1_name,'w')
errlog = open("errlog2",'w')
cmd = 'grep -B 1 -A 2 ^%s %s' % (barcode_up, fqFile)
subprocess.call(cmd, shell=True,stdout=parsed_file_step1, stderr=errlog)
#(note: pipe into sed to remove barcodes and associated quality scores from each line)
cmd = "grep -B 1 -A 2 ^%s %s | sed '2~2s/^%s//g'" % (barcode_up, fqFile, '.'*len(barcode_up))
subprocess.call(cmd, shell=True,stdout=parsed_file_step1, stderr=errlog)
errlog.close()
parsed_file_step1.close()

"""grep with -B and -A produces spacer marks '--' in file. Cannot figure out how to suppress these,
so remove and paste into new file, then delete original"""

parsed_file_name = str(OutFolder + "/indiv" + name + "_" + barcode)
parsed_file = open(parsed_file_name,'w')
errlog = open("errlog3",'w')
cmd = 'awk "!/^--$/" %s' % (parsed_file_step1_name)
subprocess.check_call(cmd,shell=True,stdout=parsed_file,stderr=errlog)
errlog.close()
parsed_file.close()
cmd = 'rm %s' % (parsed_file_step1_name)
subprocess.check_call(cmd,shell=True)

errlog = open("errlog2",'w')
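#First grep finds reads whose sequence line starts with the barcode and, for each match, also emits the line above (header) and the two lines below (separator and quality)
#Pipe into a second grep to drop the "--" separator lines that grep inserts between non-adjacent groups of context output when -A/-B is used
#Pipe into sed to strip the leading barcode (plus the optional -t offset) from every second line, i.e. from each sequence line and its matching quality line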
cmd = """grep -B 1 -A 2 ^%s %s | grep -v "^--$" | sed '2~2s/^%s//g'""" % (barcode_up, fqFile, '.'*(len(barcode_up)+offset))
try:
failed = subprocess.call(cmd, shell=True,stdout=parsed_file, stderr=errlog)
finally:
errlog.close()
parsed_file.close()
bc.close()

"""Now collect all unparsed reads"""
Expand All @@ -146,9 +136,11 @@ def main():
nomatch_file = open(OutFolder + "/nomatches",'w')
errlog = open("errlog4",'w')
cmd = "awk 'NR%%4==2' %s | grep -f %s -v" % (fqFile, bconly)
nomatch = subprocess.call(cmd, shell=True, stdout=nomatch_file,stderr=errlog)
errlog.close()
nomatch_file.close()
try:
nomatch = subprocess.call(cmd, shell=True, stdout=nomatch_file,stderr=errlog)
finally:
errlog.close()
nomatch_file.close()

"""delete tempfq, the gunzipped original file"""
cmd = 'rm tempfq bcOnly errlog1 errlog2 errlog3 errlog4'
Expand Down