-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile
68 lines (61 loc) · 2.62 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
configfile: "./config/config.yaml"
# reference files paths
annotation_path = config["annotation_path"]
STAR_index_path = config["STAR_index_path"]
gtf_path = config["gtf_path"]
salmon_index_path = config["salmon_index_path"]
fa_path = config["fa_path"]
# Read samples.tsv to extract Samples, fq1, fq2
Samples = []
fq1 = []
fq2 = []
with open(config["samples"]) as file:
for line in file:
l = line.strip().split(' ')
if len(l) == 3:
Samples.append(l[0])
fq1.append(l[1])
fq2.append(l[2])
# Assume the full sample name is within the fq1 and fq2 file name
# Get the suffix of the seq file name excluding the sample name, used in starAlign.smk and salmonQuant.smk
fq1_suffix = fq1[0].replace(Samples[0], "")
fq2_suffix = fq2[0].replace(Samples[0], "")
###### Get wildcards for Reads (accomodate 3 patterns: .fastq.gz, .fq.gz, .fastq)
# Define the patterns with possible suffixes
patterns = ["data/{Read}.fastq.gz", "data/{Read}.fastq", "data/{Read}.fq.gz", "data/{Read}.fq"]
# Use glob_wildcards to find matching files
Reads = set()
for pattern in patterns:
Reads.update(glob_wildcards(pattern).Read)
# Convert set to list
Reads = list(Reads)
###### Perform fastQC based on the file type
if ".fastq.gz" in fq1_suffix:
include: "rules/preAlignQC_fastq_gz.smk",
elif ".fq.gz" in fq1_suffix:
include: "rules/preAlignQC_fq_gz.smk",
elif ".fastq" in fq1_suffix:
include: "rules/preAlignQC_fastq.smk",
else:
include: "rules/preAlignQC_fq.smk",
include: "rules/starAlign.smk",
include: "rules/salmonQuant.smk",
include: "rules/countMatrix.smk",
include: "rules/rsem.smk",
rule all:
input:
expand("results/fastqc_results/{Read}_fastqc.zip", Read=Reads),
expand("results/fastqc_results/{Read}_fastqc.html", Read=Reads),
expand("results/STAR_results/{Sample}/{Sample}_Log.final.out", Sample=Samples),
"results/star_wide_countMatrix.csv",
"results/star_wide_countMatrix.Rds",
expand("results/salmon_results/{Sample}/quant.sf", Sample=Samples),
"results/salmon_wide_TPM_Matrix.csv",
"results/salmon_wide_TPM_Matrix.Rds",
multiext("rsem_ref/human_gencode", ".chrlist", ".n2g.idx.fa", ".transcripts.fa", ".grp", ".seq", ".idx.fa", ".ti"),
expand("results/rsem_results/{Sample}/{Sample}.genes.results", Sample=Samples),
expand("results/rsem_results/{Sample}/{Sample}.isoforms.results", Sample=Samples),
"results/rsem_geneLevel_wide_TPM_Matrix.csv",
"results/rsem_geneLevel_wide_TPM_Matrix.Rds",
"results/rsem_isoformLevel_wide_TPM_Matrix.csv",
"results/rsem_isoformLevel_wide_TPM_Matrix.Rds",