Skip to content

Commit 12c96cb

Browse files
committed
Renamed STAR scripts, bug fixes
1 parent 4eaeebc commit 12c96cb

14 files changed

+341
-17
lines changed

.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Byte compiled
2+
*.py[cod]
3+
__pycache__/
4+
*$py.class
5+

README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ templates/ - mako template for the qsub script
66

77
So far, I have classes that will handle these aligners
88

9-
* Topha2
9+
* Tophat2
1010
* Cufflinks/Cuffquant
1111
* featureCount (subread)
1212
* kallisto
@@ -15,3 +15,4 @@ So far, I have classes that will handle these aligners
1515
* SpliceTrap
1616

1717

18+

bin/make_STAR.py

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/bin/env python
2+
3+
import os, sys
4+
import json
5+
import argparse
6+
sys.path.append("/home/rwang/rtwcode/rnaseq_tools/scripts")
7+
from star import *
8+
from index import *
9+
from sge import *
10+
11+
# Call variants from RNAseq using STAR aligner
12+
# this is broad/gatk recommended workflow
13+
14+
def makeSTARScripts(basedir, samples, reference):
15+
pass
16+
17+
# assume genome index file already exists (1st pass)
18+
# align
19+
STARIndex = Index(reference, "STAR")
20+
index = STARIndex.output()
21+
22+
for samp in samples:
23+
read1 = os.path.join(basedir, samp, "00-raw", samp+"_1.fastq.gz")
24+
read2 = os.path.join(basedir, samp, "00-raw", samp+"_2.fastq.gz")
25+
26+
outputdir = os.path.join(basedir, samp, "03-alignSTAR" )
27+
if not os.path.exists(outputdir):
28+
os.makedirs(outputdir)
29+
outFileNamePrefix = os.path.join(outputdir, samp)
30+
sa = STARAligner( index, read1, read2, outFileNamePrefix, bamout=True, threads=48, mem='100G')
31+
cmdtxt = sa.makeCommand()
32+
33+
qsub = SGE(samp, "/home/rwang/rtwcode/rnaseq_tools/templates/qsub_tophat.tmpl")
34+
args = {'command':cmdtxt, 'jobname': str(samp)+str(reference), 'jobmem':'100G', 'logfilename': "_".join([str(samp), "STAR", str(reference)+".log"])}
35+
outscript = os.path.join(basedir, samp, str(samp) + "_STAR_" + str(reference) + ".sh")
36+
print outscript
37+
qsub.createJobScript(outscript, **args)
38+
39+
40+
41+
if __name__=="__main__":
42+
parser = argparse.ArgumentParser()
43+
parser.add_argument("configfile", help="config file with options: eg config_tophat2.json")
44+
args = parser.parse_args()
45+
46+
config = json.loads(open(args.configfile).read())
47+
makeSTARScripts(config['basedir'], config['samples'], config['reference'])

bin/make_STAR2pass.py

+69
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/env python
2+
3+
import os, sys
4+
import json
5+
import argparse
6+
sys.path.append("/home/rwang/rtwcode/rnaseq_tools/scripts")
7+
from star import *
8+
from index import *
9+
from sge import *
10+
11+
# Call variants from RNAseq using STAR aligner
12+
# this is broad/gatk recommended workflow
13+
14+
def makeSTARVariantScripts(basedir, samples, reference):
15+
pass
16+
17+
# assume genome index file already exists (1st pass)
18+
# align
19+
STARIndex = Index(reference, "STAR")
20+
index = STARIndex.output()
21+
22+
for samp in samples:
23+
read1 = os.path.join(basedir, samp, "00-raw", samp+"_1.fastq.gz")
24+
read2 = os.path.join(basedir, samp, "00-raw", samp+"_2.fastq.gz")
25+
26+
# 1st pass alignment
27+
outputdir = os.path.join(basedir, samp, "07-variants", "step1align")
28+
if not os.path.exists(outputdir):
29+
os.makedirs(outputdir)
30+
outFileNamePrefix = os.path.join(outputdir, samp)
31+
sa = STARAligner( index, read1, read2, outFileNamePrefix, bamout=True, threads=48, mem='100G')
32+
cmdtxt = sa.makeCommand()
33+
34+
#make 2nd index using SJ from 1st pass
35+
outputdir2 = os.path.join(basedir, samp, "07-variants", "step2reindex")
36+
if not os.path.exists(outputdir2):
37+
os.makedirs(outputdir2)
38+
hg19fasta = "/home/rwang/indexes/hg19/igenomes/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa"
39+
si = STARIndexCreator(outputdir2, hg19fasta,
40+
SJout= os.path.join(outputdir, samp+"SJ.out.tab"),
41+
SJoverhang = 75)
42+
cmdtxt = cmdtxt + "\n" + si.makeCommand()
43+
44+
print cmdtxt
45+
46+
# second alignment
47+
outputdir3 = os.path.join(basedir, samp, "07-variants", "step3align2")
48+
if not os.path.exists(outputdir3):
49+
os.makedirs(outputdir3)
50+
outFileNamePrefix2 = os.path.join(outputdir3, samp)
51+
sa = STARAligner( outputdir2, read1, read2, outFileNamePrefix2, bamout=True, threads=48, mem='100G')
52+
53+
cmdtxt = cmdtxt + "\n" + sa.makeCommand()
54+
55+
qsub = SGE(samp, "/home/rwang/rtwcode/rnaseq_tools/templates/qsub_tophat.tmpl")
56+
args = {'command':cmdtxt, 'jobname': str(samp)+str(reference), 'jobmem':'100G', 'logfilename': "_".join([str(samp), "STAR", str(reference)+".log"])}
57+
outscript = os.path.join(basedir, samp, str(samp) + "_STAR2pass_" + str(reference) + ".sh")
58+
print outscript
59+
qsub.createJobScript(outscript, **args)
60+
61+
62+
63+
if __name__=="__main__":
64+
parser = argparse.ArgumentParser()
65+
parser.add_argument("configfile", help="config file with options: eg config_tophat2.json")
66+
args = parser.parse_args()
67+
68+
config = json.loads(open(args.configfile).read())
69+
makeSTARVariantScripts(config['basedir'], config['samples'], config['reference'])

bin/make_STAR2passMendelianSeq.py

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/env python
2+
3+
import os, sys
4+
import json
5+
import argparse
6+
sys.path.append("/home/rwang/rtwcode/rnaseq_tools/scripts")
7+
from star import *
8+
from index import *
9+
from sge import *
10+
11+
# follow macarthur lab 2pass alignment of muscle dystrophy samples
12+
# this is specifcally for the 2nd pass where we use
13+
# a merged and filtered index that was generated previously
14+
15+
def makeSTAR2passMendelianSeq(basedir, samples, mergedReference, outdir):
16+
pass
17+
18+
# assume genome index file already exists (1st pass)
19+
# align
20+
#STARIndex = Index(reference, "STAR")
21+
#index = STARIndex.output()
22+
23+
index = mergedReference
24+
reference = "MendelianSeq"
25+
26+
for samp in samples:
27+
read1 = os.path.join(basedir, samp, "00-raw", samp+"_1.fastq.gz")
28+
read2 = os.path.join(basedir, samp, "00-raw", samp+"_2.fastq.gz")
29+
30+
# 1st pass alignment
31+
outputdir = os.path.join(basedir, samp, outdir)
32+
if not os.path.exists(outputdir):
33+
os.makedirs(outputdir)
34+
outFileNamePrefix = os.path.join(outputdir, samp)
35+
sa = STARAligner( index, read1, read2, outFileNamePrefix, bamout=True, threads=48, mem='100G')
36+
cmdtxt = sa.makeCommand()
37+
38+
qsub = SGE(samp, "/home/rwang/rtwcode/rnaseq_tools/templates/qsub_tophat.tmpl")
39+
args = {'command':cmdtxt, 'jobname': str(samp)+str(reference), 'jobmem':'100G', 'logfilename': "_".join([str(samp), "STAR2passMendelianSeq", str(reference)+".log"])}
40+
outscript = os.path.join(basedir, samp, str(samp) + "_STAR2passMendelianSeq_" + str(reference) + ".sh")
41+
print outscript
42+
qsub.createJobScript(outscript, **args)
43+
44+
45+
46+
if __name__=="__main__":
47+
parser = argparse.ArgumentParser()
48+
parser.add_argument("configfile", help="config file with options: eg config_STAR2passMendelian.json")
49+
args = parser.parse_args()
50+
51+
config = json.loads(open(args.configfile).read())
52+
makeSTAR2passMendelianSeq(config['basedir'], config['samples'], config['mergedReference'], config['outdir'])

bin/make_bamfile_index.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/bin/env python
2+
3+
import os, sys
4+
import json
5+
import argparse
6+
sys.path.append("/home/rwang/rtwcode/rnaseq_tools/scripts")
7+
from samtools import *
8+
from sge import *
9+
10+
# Index all bam files
11+
12+
def makeBamfileIndex(basedir, samples ):
13+
14+
for samp in samples:
15+
fullcmdtxt = ""
16+
for subdir in ["03-align", "03-alignDMD", "03-alignSTAR"]:
17+
if subdir=="03-alignSTAR":
18+
bamfilePath = os.path.join(basedir, samp, subdir, samp+"Aligned.sortedByCoord.out.bam")
19+
sa = Samtools( "index", bamfilePath)
20+
cmdtxt = sa.makeCommand()
21+
elif subdir=="03-align":
22+
bamfilePath = os.path.join(basedir, samp, subdir, samp+"_transcriptome.bam")
23+
sa = Samtools( "index", bamfilePath)
24+
cmdtxt = sa.makeCommand()
25+
elif subdir=="03-alignDMD":
26+
bamfilePath = os.path.join(basedir, samp, subdir, samp+"_427m.bam")
27+
sa = Samtools( "index", bamfilePath)
28+
cmdtxt = sa.makeCommand()
29+
30+
fullcmdtxt = fullcmdtxt + cmdtxt + "\n"
31+
print fullcmdtxt
32+
33+
qsub = SGE(samp, "/home/rwang/rtwcode/rnaseq_tools/templates/qsub_tophat.tmpl")
34+
args = {'command':fullcmdtxt, 'jobname': str(samp)+"index", 'jobmem':'10G', 'logfilename': "_".join([str(samp), "index.log"])}
35+
outscript = os.path.join(basedir, samp, str(samp) + "_bamindex.sh")
36+
print outscript
37+
qsub.createJobScript(outscript, **args)
38+
39+
40+
if __name__=="__main__":
41+
parser = argparse.ArgumentParser()
42+
parser.add_argument("configfile", help="config file with options: eg config_tophat2.json")
43+
args = parser.parse_args()
44+
45+
config = json.loads(open(args.configfile).read())
46+
makeBamfileIndex(config['basedir'], config['samples'])

bin/make_fastqc.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/env python
2+
3+
import os, sys
4+
import json
5+
import argparse
6+
sys.path.append("/home/rwang/rtwcode/rnaseq_tools/scripts")
7+
from fastqc import *
8+
from sge import *
9+
10+
11+
def makeFastQCscripts(basedir, samples):
12+
13+
for samp in samples:
14+
read1 = os.path.join(basedir, samp, "00-raw", samp+"_1.fastq.gz")
15+
read2 = os.path.join(basedir, samp, "00-raw", samp+"_2.fastq.gz")
16+
17+
outputdir = os.path.join(basedir, samp, "02-FastQC" )
18+
if not os.path.exists(outputdir):
19+
os.makedirs(outputdir)
20+
21+
outFileNamePrefix = os.path.join(outputdir, samp)
22+
fileList = " ".join([read1, read2]) # read1.gz read2.gz
23+
FastQC_obj = FastQC( outputdir, fileList=fileList)
24+
cmdtxt = FastQC_obj.makeCommand()
25+
26+
qsub = SGE(samp, "/home/rwang/rtwcode/rnaseq_tools/templates/qsub_tophat.tmpl")
27+
args = {'command':cmdtxt, 'jobname': str(samp)+"FastQC", 'jobmem':'4G', 'logfilename': "_".join([str(samp), "FastQC.log"])}
28+
outscript = os.path.join(basedir, samp, str(samp) + "_FastQC" + ".sh")
29+
print outscript
30+
qsub.createJobScript(outscript, **args)
31+
32+
33+
34+
if __name__=="__main__":
35+
parser = argparse.ArgumentParser()
36+
parser.add_argument("configfile", help="config file with options: eg config_tophat2.json")
37+
args = parser.parse_args()
38+
39+
config = json.loads(open(args.configfile).read())
40+
makeFastQCscripts(config['basedir'], config['samples'] )

bin/make_featureCount.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,19 @@
2020
# basedir
2121
# reference (hg19, dmd transcript)
2222

23-
def makeFeatureCountScripts( samples, basedir, annot):
23+
def makeFeatureCountScripts( samples, basedir, annot, bamtype='tophat2'):
2424

2525
#ANNOT='/home/rwang/scratch1/rnaseq/datasets/gencode/release24/gencode.v24lift37.basic.annotation.gtf'
2626
ANNOT='/home/rwang/indexes/hg19/igenomes/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf'
2727

2828
for samp in samples:
29-
bamfile = os.path.join(basedir, samp, "03-align", samp+"_transcriptome.bam")
29+
if bamtype=="tophat2":
30+
bamfile = os.path.join(basedir, samp, "03-align", samp+"_transcriptome.bam")
31+
elif bamtype=="STAR":
32+
bamfile = os.path.join(basedir, samp, "03-alignSTAR", samp+"Aligned.sortedByCoord.out.bam")
33+
else:
34+
sys.exit("bamtype paramter must be 'tophat' or 'STAR'")
35+
3036
outputdir = os.path.join(basedir, samp, "05-featureCount")
3137
if not os.path.exists(outputdir):
3238
os.makedirs(outputdir)
@@ -63,6 +69,11 @@ def makeFeatureCountScripts( samples, basedir, annot):
6369
config = json.loads(open(args.configfile).read())
6470
# test if samples, basedir, reference are specified in JSON file
6571
if all(k in config for k in ('samples', 'basedir', 'annotation')) :
66-
print config['samples']
67-
makeFeatureCountScripts(config['samples'], config['basedir'], annot=config['annotation'])
72+
if 'bamtype' in config.keys():
73+
bamtype=config['bamtype']
74+
else:
75+
bamtype='STAR'
76+
print "samples are " + " ".join(config['samples'])
77+
print "bamtype is " + str(bamtype)
78+
makeFeatureCountScripts(config['samples'], config['basedir'], annot=config['annotation'], bamtype=bamtype)
6879

bin/make_splicetrap.py

+16-4
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,30 @@
1919
# basedir
2020
# reference (hg19, dmd transcript)
2121

22-
def makeSpliceTrapScripts(basedir, samples, reference, readsize, cutoff, outputFilePrefix ):
22+
def makeSpliceTrapScripts(basedir, samples, reference, readsize, cutoff ):
2323

2424
#transcriptIndex = '/share/apps/richard/kallisto/kallisto_linux-v0.42.5/index/ensembl_GRCh37_transcripts_index'
2525
transcriptIndex = reference
2626
numThreads = 4
2727
for samp in samples:
28-
# hack to decompress fastq.gz on the fly
28+
# hack to decompress fastq.gz on the fly, does not work
2929
#read1 = "<(gunzip -c " + os.path.join(basedir, samp, "00-raw", samp + "_1.fastq.gz") + ")"
3030
#read2 = "<(gunzip -c " + os.path.join(basedir, samp, "00-raw", samp + "_2.fastq.gz") + ")"
31+
32+
# need to check if fastq is GZIPPED and uncompress first if so
3133
read1 = os.path.join(basedir, samp, "00-raw", samp + "_1.fastq")
3234
read2 = os.path.join(basedir, samp, "00-raw", samp + "_2.fastq")
35+
36+
if not os.path.isfile(read1):
37+
read1gz = os.path.join(basedir, samp, "00-raw", samp + "_1.fastq.gz")
38+
read2gz = os.path.join(basedir, samp, "00-raw", samp + "_2.fastq.gz")
39+
cmdtxt = "gunzip -c " + read1gz + " > " + os.path.join(basedir, samp, "00-raw", samp + "_1.fastq") + "\n"
40+
cmdtxt += "gunzip -c " + read2gz + " > " + os.path.join(basedir, samp, "00-raw", samp + "_2.fastq") + "\n"
41+
3342
outputdir = os.path.join(basedir, samp, "05-splicetrap")
3443
if not os.path.exists(outputdir):
3544
os.makedirs(outputdir)
45+
outputFilePrefix = samp
3646

3747
nameOfJob = samp + "splicetrap"
3848
sj = SpliceTrap(readsize,
@@ -45,19 +55,21 @@ def makeSpliceTrapScripts(basedir, samples, reference, readsize, cutoff, outputF
4555
nameOfJob,
4656
reference
4757
)
48-
cmdtxt = sj.makeCommand()
58+
cmdtxt += sj.makeCommand()
59+
4960
print cmdtxt
5061

5162
qsub = SGE(samp, "/home/rwang/rtwcode/rnaseq_tools/templates/qsub_tophat.tmpl")
5263
args = {'command':cmdtxt, 'jobname': str(samp)+"splicetrap", 'jobmem':'20G', 'logfilename': "_".join([str(samp), "splicetrap.log"])}
5364
outscript = os.path.join(basedir, samp, str(samp) + "_splicetrap" + ".sh")
5465
print outscript
5566
qsub.createJobScript(outscript, **args)
67+
cmdtxt = ""
5668

5769
# generate all tophat scripts:
5870

5971
#samples = [ "DDX7", "DDX8", "DDX9", "SH790"]
6072
#basedir='/home/rwang/scratch1/rnaseq/human/Feb12/'
6173
config = json.loads(open("config_splicetrap.json").read())
62-
makeSpliceTrapScripts(config['basedir'], config['samples'], config['reference'], config['readsize'], config['cutoff'], config['outputFilePrefix'] )
74+
makeSpliceTrapScripts(config['basedir'], config['samples'], config['reference'], config['readsize'], config['cutoff'] )
6375

bin/rename_tophat_bams.py

+2
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010

1111
def renameTophatBamToSample(samples, basedir, reference):
1212
for samp in samples:
13+
# Tophat
1314
if reference in ("hg19", "mm9"):
1415
bamfile = os.path.join(basedir, samp, "03-align", "accepted_hits.bam")
1516
newbamfile=os.path.join(basedir, samp, "03-align", samp + "_transcriptome.bam")
17+
# align dmd
1618
elif reference in ("dmd427m"):
1719
bamfile = os.path.join(basedir, samp, "03-alignDMD", "accepted_hits.bam")
1820
newbamfile=os.path.join(basedir, samp, "03-alignDMD", samp + "_427m.bam")

0 commit comments

Comments
 (0)