Skip to content

Commit 62b5222

Browse files
committed
add new
1 parent 993b811 commit 62b5222

File tree

8 files changed

+350
-2
lines changed

8 files changed

+350
-2
lines changed

faSize.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
# @Time : 12/3/2020 3:42 PM
4+
# @Author : Runsheng
5+
# @File : faSize.py
6+
"""
7+
mimic the faSize -detailed result using UCSC kentutils
8+
"""
9+
import argparse
10+
from Bio import SeqIO
11+
12+
13+
def fa_size(fastafile, filetype="fastq"):
14+
"""
15+
Give a fasta file name, return a dict contains the name and seq
16+
Require Biopython SeqIO medule to parse the sequence into dict, a large genome may take a long time to parser
17+
"""
18+
len_list=[]
19+
handle=open(fastafile, "r")
20+
for contig in SeqIO.parse(handle,filetype):
21+
name=contig.name
22+
len_list.append( (name, len(contig)) )
23+
handle.close()
24+
return len_list
25+
26+
27+
def write_len(len_list, filename="out.sizes"):
28+
fw=open(filename, "w")
29+
for i in len_list:
30+
name, length=i
31+
fw.write(name+"\t"+str(length)+"\n")
32+
fw.close()
33+
34+
35+
if __name__=="__main__":
36+
37+
example_text = '''example:
38+
### example to run the faSize
39+
faSize.py --file file.fa --filetype fasta --output file.fa.sizes
40+
'''
41+
42+
parser = argparse.ArgumentParser(prog='faSize',
43+
epilog=example_text,
44+
formatter_class=argparse.RawDescriptionHelpFormatter)
45+
46+
parser.add_argument("-f", "--file", help="input file in fasta/fastq format")
47+
parser.add_argument("-o", "--output", help="the two column text file for name:size")
48+
parser.add_argument("-t", "--filetype", help="the file is fastq or fasta", default="fasta")
49+
args = parser.parse_args()
50+
51+
#main
52+
len_l=fa_size(fastafile=args.file, filetype=args.filetype)
53+
write_len(len_list=len_l, filename=args.output)
54+
55+

fa_merge.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,6 @@ def dic2fasta(record_dict,out="record_dict.fasta"):
113113

114114

115115

116+
117+
118+

get_near_ref.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
# @Time : 3/2/2021 5:32 PM
4+
# @Author : Runsheng
5+
# @File : get_near_ref.py
6+
"""
7+
from mutiple references, get the nearest reference for further polish
8+
mostly used for RNA virus reference choosing
9+
"""
10+
11+
from __future__ import print_function
12+
import os
13+
import argparse
14+
import subprocess
15+
import logging
16+
import sys
17+
import signal
18+
from Bio import SeqIO
19+
import gzip
20+
import operator
21+
from collections import OrderedDict
22+
23+
24+
def myexe(cmd, timeout=0):
25+
"""
26+
a simple wrap of the shell
27+
mainly used to run the bwa mem mapping and samtool orders
28+
"""
29+
def setupAlarm():
30+
signal.signal(signal.SIGALRM, alarmHandler)
31+
signal.alarm(timeout)
32+
33+
def alarmHandler(signum, frame):
34+
sys.exit(1)
35+
36+
proc=subprocess.Popen(cmd, shell=True, preexec_fn=setupAlarm,
37+
stdout=subprocess.PIPE, stderr=subprocess.PIPE,cwd=os.getcwd())
38+
out, err=proc.communicate()
39+
print(err)
40+
return out, err, proc.returncode
41+
42+
43+
def fastq2dic(fastqfile):
44+
"""
45+
Give a fastq file name, return a dict contains the name and seq
46+
Require Biopython SeqIO medule to parse the sequence into dict, a large readfile may take a lot of RAM
47+
"""
48+
if ".gz" in fastqfile:
49+
handle=gzip.open(fastqfile, "rU")
50+
else:
51+
handle=open(fastqfile, "rU")
52+
record_dict=SeqIO.to_dict(SeqIO.parse(handle, "fastq"))
53+
handle.close()
54+
return record_dict
55+
56+
57+
def chr_select(record_dict, chro):
58+
"""
59+
Note the start and end is 0 based
60+
give the name of refdic, and the chr, start and end to be used
61+
return the name and sequence (both as str)
62+
for example, chrcut(record_dict, "I", 100,109) returns
63+
("I:100_109","AAAAAAAAAA")
64+
"""
65+
name=record_dict[chro].name
66+
seq=str(record_dict[chro].seq)
67+
return name,seq
68+
69+
70+
def wrapper_run_get_bedfile(ref, fastq, core=32):
71+
72+
cmd_minimap2="""
73+
minimap2 -ax map-ont -t {core} {ref} {fastq} > map.sam
74+
samtools view -F 4 -b map.sam > map.bam
75+
samtools sort map.bam > maps.bam
76+
bedtools genomecov -ibam maps.bam -bga > {ref}.bed
77+
rm map.sam
78+
rm map.bam
79+
""".format(ref=ref, fastq=fastq, core=core)
80+
print(cmd_minimap2)
81+
print(myexe(cmd_minimap2))
82+
83+
return ref+".bed"
84+
85+
86+
def bed_parser_get_higest_coverage(bedfile):
87+
"""
88+
parser the bed file and get the refname which has higest coverage
89+
:param bedfile:
90+
:return:
91+
"""
92+
cov_sum={}
93+
94+
f=open(bedfile, "r")
95+
for line in f.readlines():
96+
line_l=line.strip().split("\t")
97+
name, start, end, coverage=line_l
98+
try:
99+
cov_sum[name]+=(int(end)-int(start)) * int(coverage)
100+
except KeyError:
101+
cov_sum[name] = (int(end) - int(start)) * int(coverage)
102+
sorted_d = sorted(cov_sum.items(), key=operator.itemgetter(1), reverse=True)
103+
print(sorted_d[0])
104+
105+
f.close()
106+
107+
108+
109+
110+
if __name__=="__main__":
111+
example_text = '''example:
112+
### example to run the bedtools intersection for all bed files with the nr3c1exon.gtf file
113+
get_near_ref.py -r ref.fasta -f read.fastq > near1.fasta
114+
'''
115+
116+
parser = argparse.ArgumentParser(prog='runpara',
117+
description='Run bash cmd lines for files with the same surfix',
118+
epilog=example_text,
119+
formatter_class=argparse.RawDescriptionHelpFormatter)
120+
121+
parser.add_argument("-f", "--file", help="the fastq file")
122+
parser.add_argument("-r", "--reference", help="the reference file")
123+
parser.add_argument("-c", "--core", help="the core", default= 32)
124+
125+
args = parser.parse_args()
126+
127+
ref_bed=wrapper_run_get_bedfile(args.reference, args.file, args.core)
128+
bed_parser_get_higest_coverage(ref_bed)
129+
130+

runiter.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,7 @@ def get_file_round(wkdir=None, key1="round1"):
6161
wkdir=os.getcwd()
6262
os.chdir(wkdir)
6363
key=key1.replace("1", "")
64-
6564
file_name_l=glob("*"+key+"*")
66-
print(file_name_l)
6765
if file_name_l>1:
6866
num_l=[]
6967
for filename in file_name_l:

tutorials/Ecoli.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
### The survey for the L and S phase modification change using nanopore reads
2+
#### 1. samples
3+
1. 20201124_L: E.coli K12 in L phase
4+
2. 20201124_S: E.coli K12 in S phase
5+
#### 2. basecalling using high accurate guppy basecaller (V.4.1.2, report to be 94% for RNA reads)
6+
Note: This is suggested to run only with GPU server (the server used to run the sequencing has a GPU)
7+
```bash
8+
guppy_basecaller -i . \
9+
-r -c rna_r9.4.1_70bps_hac.cfg \
10+
-s ./fastq_guppy42 \
11+
--builtin_scripts 1 \
12+
-x auto
13+
```
14+
#### 3. read summary
15+
1. read number: 339715 (L); 68102 (S)
16+
2. read yield: 63M (L); 10 M (S)
17+
3. read distribution peaked at around 90 and 150 nt
18+
![read length distribution (bin=10 bp)](.\figs\Ecoli_lenplot.svg){:height="200px" width="200px"}
19+
20+
#### 4. read mapping
21+
1. reference used: GCF_000005845.2_ASM584v2 from [NCBI genome portal](https://www.ncbi.nlm.nih.gov/genome/167?genome_assembly_id=161521) or [NCBI FTP](https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/)
22+
2. mapping is done by using [minimap2](https://github.com/lh3/minimap2) and count is done by htseq-count
23+
```
24+
minimap2 -ax map-ont -t 32 \
25+
GCF_000005845.2_ASM584v2_genomic.fna \
26+
20201124_L.fastq >L.sam
27+
htseq-count --format=sam --stranded="no" \
28+
L.sam \
29+
GCF_000005845.2_ASM584v2_genomic.gff \
30+
--type=gene --idattr=Name >L.count
31+
```
32+
3. check the expression with S.count and L.count (count2.xlsx)
33+
34+
Term | L |S
35+
------------ | -------------|-------------
36+
__no_feature | 133 | 2
37+
__ambiguous | 1557| 38
38+
__too_low_aQual |503283| 32634
39+
__not_aligned |241147 | 61307
40+
__alignment_not_unique |0 | 0
41+
mapped to rRNAs |
42+
43+
44+
#### 5. The RNA methylation calling
45+
halted, tombo run finsihed, megalodon run(self model) finished with warning.
46+
epinano/nanocompare failed

tutorials/MDA.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
### This is a tutorial for sequencing MDA WGA DNA in Nanopore
2+
The raw data was basecalled using guppy 4.0.2, with R9.4.1 hac mode with average accuracy of 95%
3+
4+
1. Draw a histogram for the length distribution
5+
```R
6+
faSize
7+
```
8+

tutorials/genomeqc.md

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
## The code and protocol used for cell line's genome
2+
#### 0. input files
3+
FDSW202461999-1r_L2_1.fq.gz
4+
FDSW202461999-1r_L3_1.fq.gz
5+
FDSW202461999-1r_L2_2.fq.gz
6+
FDSW202461999-1r_L3_2.fq.gz
7+
8+
#### 1. create an conda enviroument to run the code
9+
conda create -n genomeqc
10+
conda activate genomeqc
11+
##### 2. softwares needed to be installed using "conda install xxxx"
12+
- fastqc: the quality control for fastq file
13+
- trimmomatic: trim adapter and low-quality region from reads
14+
- jellyfish: kmer counting # note the name is "kmer-jellyfish" conda install -c bioconda kmer-jellyfish
15+
- spades: a genome assembler for NGS read, same function as soap-assembler used by the company
16+
17+
#### 3. merge data from different lanes
18+
zcat FDSW202461999-1r_L2_1.fq.gz FDSW202461999-1r_L3_1.fq.gz > merge_1.fq
19+
zcat FDSW202461999-1r_L2_2.fq.gz FDSW202461999-1r_L3_2.fq.gz > merge_2.fq
20+
#### 3.1 QC for fastq (optional)
21+
## check the html output
22+
fastqc merge_1.fq
23+
fastqc merge_2.fq
24+
25+
#### 4.Run trimmomatic
26+
##### 4.1 Create a the adaptor fasta file for trimming, use the trim_p_1 and trim_p_2 files for further analysis
27+
```
28+
# adapter.fa
29+
>p7_7UDI501
30+
GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGATCAATCTCGTATGCCGTCTTCTGCTTG
31+
>p5
32+
AGATCGGAAGAGCGTCGTGTAGGGAAAGA
33+
```
34+
##### 4.2 run trim using the adapter sequences
35+
trimmomatic PE -threads 32 -phred33 merge_1.fq merge_2.fq \
36+
trim_p_1.fq trim_u_1.fq trim_p_2.fq trim_u_2.fq \
37+
ILLUMINACLIP:adapter.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
38+
39+
#### 5. Run jellyfish with differnt kmers
40+
```
41+
##### k21
42+
jellyfish count -C -m 21 -s 1000000000 -t 10 trim_p*.fq -o read_p.jf
43+
jellyfish histo -t 10 read_p.jf > 21mer.histo
44+
##### k17
45+
jellyfish count -C -m 17 -s 1000000000 -t 10 trim_p*.fq -o read.jf
46+
jellyfish histo -t 10 read.jf > read_k17.histo
47+
##### k26
48+
jellyfish count -C -m 26 -s 1000000000 -t 10 trim_p*.fq -o read.jf
49+
jellyfish histo -t 10 read.jf > read_k26.histo
50+
```
51+
52+
#### 6. draw the kmer figures to estimate the genome size
53+
```R
54+
### run this code in a small windows machine is still OK, just download these .histo file to local machine
55+
library(ggplot2)
56+
library(dplyr)
57+
58+
df17=read.csv("read_k17.histo", sep=" ", header = F)
59+
df17$size="17mer"
60+
df21=read.csv("read_k21.histo", sep=" ", header = F)
61+
df21$size="21mer"
62+
df26=read.csv("read_k26.histo", sep=" ", header = F)
63+
df26$size="26mer"
64+
65+
df=rbind(df17, df22, df26)
66+
67+
# full plot
68+
pdf("kmer.pdf", width=12, height=12)
69+
ggplot(data=df)+geom_point(aes(x=V1, y=V2, color=size))+xlim(0,500)+ylim(0, 1e7)+theme_bw()+xlab("Kmer number")+ylab("Count")
70+
dev.off()
71+
72+
df_max=filter(df, V1>100) %>% group_by(size) %>%
73+
filter(V2 == max(V2)) %>%
74+
summarise(V1)
75+
76+
df_start=filter(df, V1>5&V1<80) %>% group_by(size) %>%
77+
filter(V2 == min(V2)) %>%
78+
summarise(V1)
79+
80+
# genome size
81+
for (i in c(1,2,3)) {
82+
kmer=as.character(df_max[i,1])
83+
#print(as.character(kmer))
84+
start=as.numeric(filter(df_start, size==kmer)$V1)
85+
max=as.numeric(filter(df_max, size==kmer)$V1)
86+
#print(c(start, end, max, kmer))
87+
df_one=filter(df, size==kmer)
88+
end=as.numeric(dim(df_one)[1])
89+
sumall=sum(as.numeric(df_one$V1[start:end]*df_one$V2[start:end]))/max
90+
print(c(kmer,sumall/1000000))
91+
}
92+
### output for the diploid genome size
93+
### "17mer" "583 Gb"
94+
### "21mer" "565.Gb"
95+
### "26mer" "543.Gb"
96+
```
97+
98+
#### 7. run spades assembler to get the genome
99+
##### please check for cmd detail http://sepsis-omics.github.io/tutorials/modules/spades_cmdline/
100+
spades.py --threads 48 --memory 1400 -1 trim_p_1.fq.gz -2 trim_p_2.fq.gz --careful --cov-cutoff auto -o spadesout
101+
102+
#### 8. Simple parameters for genome
103+
N50: 6.8 Kb
104+
Largest contig: 540 Kb
105+
Total size: 720 Mb (larger than estimattion, indicate insufficient merging of repeats)
106+
Busco score : 87% (busco4)
107+
**This genome assembly from spades is just used for survey. A real genome should come from the long reads assembler polished by these short reads.**

tutorials/read.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
#### The tutorials used for teaching and training of bioinfor newcomers

0 commit comments

Comments
 (0)