add new

Runsheng · Runsheng · commit 62b5222afb4f · 2021-03-04T17:51:10.000+08:00
diff --git a/faSize.py b/faSize.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 12/3/2020 3:42 PM
+# @Author  : Runsheng     
+# @File    : faSize.py
+"""
+mimic the faSize -detailed result using UCSC kentutils
+"""
+import argparse
+from Bio import SeqIO
+
+
+def fa_size(fastafile, filetype="fastq"):
+    """
+    Give a fasta file name, return a dict contains the name and seq
+    Require Biopython SeqIO medule to parse the sequence into dict, a large genome may take a long time to parser
+    """
+    len_list=[]
+    handle=open(fastafile, "r")
+    for contig in SeqIO.parse(handle,filetype):
+        name=contig.name
+        len_list.append( (name, len(contig)) )
+    handle.close()
+    return len_list
+
+
+def write_len(len_list, filename="out.sizes"):
+    fw=open(filename, "w")
+    for i in len_list:
+        name, length=i
+        fw.write(name+"\t"+str(length)+"\n")
+    fw.close()
+
+
+if __name__=="__main__":
+
+    example_text = '''example:
+        ### example to run the faSize 
+        faSize.py --file file.fa --filetype fasta --output file.fa.sizes
+        '''
+
+    parser = argparse.ArgumentParser(prog='faSize',
+                                     epilog=example_text,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument("-f", "--file", help="input file in fasta/fastq format")
+    parser.add_argument("-o", "--output", help="the two column text file for name:size")
+    parser.add_argument("-t", "--filetype", help="the file is fastq or fasta", default="fasta")
+    args = parser.parse_args()
+
+    #main
+    len_l=fa_size(fastafile=args.file, filetype=args.filetype)
+    write_len(len_list=len_l, filename=args.output)
+
+
diff --git a/fa_merge.py b/fa_merge.py
@@ -113,3 +113,6 @@ def dic2fasta(record_dict,out="record_dict.fasta"):
 
 
 
+
+
+
diff --git a/get_near_ref.py b/get_near_ref.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# @Time    : 3/2/2021 5:32 PM
+# @Author  : Runsheng     
+# @File    : get_near_ref.py
+"""
+from multiple references, get the nearest reference for further polishing
+mostly used for RNA virus reference choosing
+"""
+
+from __future__ import print_function
+import os
+import argparse
+import subprocess
+import logging
+import sys
+import signal
+from Bio import SeqIO
+import gzip
+import operator
+from collections import OrderedDict
+
+
+def myexe(cmd, timeout=0):
+    """
+    A simple wrapper that runs a command line through the shell.
+    Mainly used to run the read mapping and samtools commands.
+
+    :param cmd: shell command string; it is run with shell=True in the current
+        working directory
+    :param timeout: number of seconds handed to signal.alarm() by the
+        preexec_fn hook
+        NOTE(review): presumably the default 0 means "no time limit"
+        (signal.alarm(0) schedules no alarm) -- confirm
+    :return: tuple (out, err, returncode) with the captured stdout, the
+        captured stderr and the exit status of the command
+    """
+    # handed to Popen as preexec_fn: installs the SIGALRM handler and arms the alarm
+    def setupAlarm():
+        signal.signal(signal.SIGALRM, alarmHandler)
+        signal.alarm(timeout)
+
+    # SIGALRM handler: leave with exit status 1
+    def alarmHandler(signum, frame):
+        sys.exit(1)
+
+    proc=subprocess.Popen(cmd, shell=True, preexec_fn=setupAlarm,
+                          stdout=subprocess.PIPE, stderr=subprocess.PIPE,cwd=os.getcwd())
+    # communicate() waits for the command to end and collects both pipes
+    out, err=proc.communicate()
+    # echo the captured stderr so tool messages show up in the caller's output
+    print(err)
+    return out, err, proc.returncode
+
+
+def fastq2dic(fastqfile):
+    """
+    Give a fastq file name, return a dict contains the name and seq
+    Require Biopython SeqIO medule to parse the sequence into dict, a large readfile may take a lot of RAM
+    """
+    if ".gz" in fastqfile:
+        handle=gzip.open(fastqfile, "rU")
+    else:
+        handle=open(fastqfile, "rU")
+    record_dict=SeqIO.to_dict(SeqIO.parse(handle, "fastq"))
+    handle.close()
+    return record_dict
+
+
+def chr_select(record_dict, chro):
+    """
+    Note the start and end is 0 based
+    give the name of refdic, and the chr, start and end to be used
+    return the name and sequence (both as str)
+    for example, chrcut(record_dict, "I", 100,109) returns
+     ("I:100_109","AAAAAAAAAA")
+    """
+    name=record_dict[chro].name
+    seq=str(record_dict[chro].seq)
+    return name,seq
+
+
+def wrapper_run_get_bedfile(ref, fastq, core=32):
+
+    cmd_minimap2="""
+    minimap2 -ax map-ont -t {core} {ref} {fastq} > map.sam
+    samtools view -F 4 -b map.sam > map.bam
+    samtools sort map.bam > maps.bam
+    bedtools genomecov -ibam maps.bam -bga > {ref}.bed 
+    rm map.sam
+    rm map.bam
+    """.format(ref=ref, fastq=fastq, core=core)
+    print(cmd_minimap2)
+    print(myexe(cmd_minimap2))
+
+    return ref+".bed"
+
+
+def bed_parser_get_higest_coverage(bedfile):
+    """
+    parser the bed file and get the refname which has higest coverage
+    :param bedfile:
+    :return:
+    """
+    cov_sum={}
+
+    f=open(bedfile, "r")
+    for line in f.readlines():
+        line_l=line.strip().split("\t")
+        name, start, end, coverage=line_l
+        try:
+            cov_sum[name]+=(int(end)-int(start)) * int(coverage)
+        except KeyError:
+            cov_sum[name] = (int(end) - int(start)) * int(coverage)
+    sorted_d = sorted(cov_sum.items(), key=operator.itemgetter(1), reverse=True)
+    print(sorted_d[0])
+
+    f.close()
+    
+
+
+
+if __name__=="__main__":
+    example_text = '''example:
+        ### example to run the bedtools intersection for all bed files with the nr3c1exon.gtf file
+        get_near_ref.py -r ref.fasta -f read.fastq > near1.fasta 
+        '''
+
+    parser = argparse.ArgumentParser(prog='runpara',
+                                     description='Run bash cmd lines for files with the same surfix',
+                                     epilog=example_text,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    parser.add_argument("-f", "--file", help="the fastq file")
+    parser.add_argument("-r", "--reference", help="the reference file")
+    parser.add_argument("-c", "--core", help="the core", default= 32)
+
+    args = parser.parse_args()
+
+    ref_bed=wrapper_run_get_bedfile(args.reference, args.file, args.core)
+    bed_parser_get_higest_coverage(ref_bed)
+
+
diff --git a/runiter.py b/runiter.py
@@ -61,9 +61,7 @@ def get_file_round(wkdir=None, key1="round1"):
         wkdir=os.getcwd()
     os.chdir(wkdir)
     key=key1.replace("1", "")
-
     file_name_l=glob("*"+key+"*")
-    print(file_name_l)
     if file_name_l>1:
         num_l=[]
         for filename in file_name_l:
diff --git a/tutorials/Ecoli.md b/tutorials/Ecoli.md
@@ -0,0 +1,46 @@
+### The survey for the L and S phase modification change using nanopore reads
+#### 1. samples
+    1. 20201124_L: E.coli K12 in L phase
+    2. 20201124_S: E.coli K12 in S phase
+#### 2. basecalling using the high-accuracy guppy basecaller (V.4.1.2, reported to be 94% accurate for RNA reads)
+Note: running this step only on a GPU server is recommended (the server used to run the sequencing has a GPU)
+```bash
+guppy_basecaller -i . \
+    -r -c rna_r9.4.1_70bps_hac.cfg \
+    -s  ./fastq_guppy42 \
+    --builtin_scripts 1 \
+    -x auto
+```
+#### 3. read summary
+   1. read number: 339715 (L); 68102 (S)
+   2. read yield: 63M (L); 10 M (S)
+   3. read distribution peaked at around 90 and 150 nt 
+      ![read length distribution (bin=10 bp)](./figs/Ecoli_lenplot.svg){:height="200px" width="200px"}
+
+#### 4. read mapping
+   1. reference used: GCF_000005845.2_ASM584v2 from [NCBI genome portal](https://www.ncbi.nlm.nih.gov/genome/167?genome_assembly_id=161521) or [NCBI FTP](https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/)
+   2. mapping is done by using [minimap2](https://github.com/lh3/minimap2) and count is done by htseq-count
+        ```
+        minimap2 -ax map-ont -t 32  \
+            GCF_000005845.2_ASM584v2_genomic.fna  \
+            20201124_L.fastq >L.sam
+        htseq-count --format=sam --stranded="no" \
+            L.sam \
+            GCF_000005845.2_ASM584v2_genomic.gff \
+            --type=gene --idattr=Name >L.count
+        ```
+   3. check the expression with S.count and L.count (count2.xlsx)
+
+Term | L |S
+------------ | -------------|-------------
+__no_feature  |  133 | 2
+__ambiguous    | 1557| 38
+__too_low_aQual |503283| 32634
+__not_aligned   |241147 | 61307
+__alignment_not_unique |0 | 0
+mapped to rRNAs | 
+
+
+#### 5. The RNA methylation calling 
+halted: the tombo run finished, the megalodon run (self model) finished with a warning. 
+epinano/nanocompare failed 
diff --git a/tutorials/MDA.md b/tutorials/MDA.md
@@ -0,0 +1,8 @@
+### This is a tutorial for sequencing MDA WGA DNA in Nanopore
+The raw data was basecalled using guppy 4.0.2, with R9.4.1 hac mode with average accuracy of 95%
+
+1. Draw a histogram for the length distribution 
+```R
+faSize 
+```
+
diff --git a/tutorials/genomeqc.md b/tutorials/genomeqc.md
@@ -0,0 +1,107 @@
+## The code and protocol used for cell line's genome
+#### 0. input files
+    FDSW202461999-1r_L2_1.fq.gz
+    FDSW202461999-1r_L3_1.fq.gz
+    FDSW202461999-1r_L2_2.fq.gz
+    FDSW202461999-1r_L3_2.fq.gz
+    
+#### 1. create a conda environment to run the code
+    conda create -n genomeqc
+    conda activate genomeqc
+#### 2. software to be installed using "conda install xxxx"
+- fastqc: the quality control for fastq file
+- trimmomatic: trim adapter and low-quality region from reads
+- jellyfish: kmer counting # note the name is "kmer-jellyfish" conda install -c bioconda kmer-jellyfish
+- spades: a genome assembler for NGS read, same function as soap-assembler used by the company
+
+#### 3.  merge data from different lanes
+    zcat FDSW202461999-1r_L2_1.fq.gz FDSW202461999-1r_L3_1.fq.gz > merge_1.fq
+    zcat FDSW202461999-1r_L2_2.fq.gz FDSW202461999-1r_L3_2.fq.gz > merge_2.fq
+#### 3.1 QC for fastq (optional)
+    ## check the html output 
+    fastqc merge_1.fq
+    fastqc merge_2.fq
+
+#### 4.Run trimmomatic
+##### 4.1 Create the adapter fasta file for trimming; use the trim_p_1 and trim_p_2 files for further analysis
+```
+# adapter.fa
+>p7_7UDI501
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGATCAATCTCGTATGCCGTCTTCTGCTTG
+>p5
+AGATCGGAAGAGCGTCGTGTAGGGAAAGA
+```
+##### 4.2 run trim using the adapter sequences
+    trimmomatic PE -threads 32 -phred33 merge_1.fq merge_2.fq \
+	    trim_p_1.fq trim_u_1.fq trim_p_2.fq trim_u_2.fq \
+	    ILLUMINACLIP:adapter.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
+	    
+#### 5. Run jellyfish with different kmers
+```
+##### k21
+jellyfish count -C -m 21 -s 1000000000 -t 10 trim_p*.fq -o read_p.jf
+jellyfish histo -t 10 read_p.jf > read_k21.histo
+##### k17
+jellyfish count -C -m 17 -s 1000000000 -t 10 trim_p*.fq -o read.jf
+jellyfish histo -t 10 read.jf > read_k17.histo
+##### k26
+jellyfish count -C -m 26 -s 1000000000 -t 10 trim_p*.fq -o read.jf
+jellyfish histo -t 10 read.jf > read_k26.histo
+```
+
+#### 6. draw the kmer figures to estimate the genome size
+```R
+### run this code in a small windows machine is still OK, just download these .histo file to local machine
+library(ggplot2)
+library(dplyr)
+
+df17=read.csv("read_k17.histo", sep=" ", header = F)
+df17$size="17mer"
+df21=read.csv("read_k21.histo", sep=" ", header = F)
+df21$size="21mer"
+df26=read.csv("read_k26.histo", sep=" ", header = F)
+df26$size="26mer"
+
+df=rbind(df17, df21, df26)
+
+# full plot
+pdf("kmer.pdf", width=12, height=12)
+ggplot(data=df)+geom_point(aes(x=V1, y=V2, color=size))+xlim(0,500)+ylim(0, 1e7)+theme_bw()+xlab("Kmer number")+ylab("Count")
+dev.off()
+
+df_max=filter(df, V1>100) %>% group_by(size) %>%
+  filter(V2 == max(V2)) %>%
+  summarise(V1)
+
+df_start=filter(df, V1>5&V1<80) %>% group_by(size) %>%
+  filter(V2 == min(V2)) %>%
+  summarise(V1)
+
+# genome size
+for (i in c(1,2,3)) {
+  kmer=as.character(df_max[i,1])
+  #print(as.character(kmer))
+  start=as.numeric(filter(df_start, size==kmer)$V1)
+  max=as.numeric(filter(df_max, size==kmer)$V1)
+  #print(c(start, end, max, kmer))
+  df_one=filter(df, size==kmer)
+  end=as.numeric(dim(df_one)[1])
+  sumall=sum(as.numeric(df_one$V1[start:end]*df_one$V2[start:end]))/max
+  print(c(kmer,sumall/1000000))
+}
+### output for the diploid genome size (sumall/1000000, i.e. in Mb)
+### "17mer"            "583 Mb"
+### "21mer"            "565 Mb"
+### "26mer"            "543 Mb"
+```
+
+#### 7. run spades assembler to get the genome
+##### please check for cmd detail http://sepsis-omics.github.io/tutorials/modules/spades_cmdline/
+    spades.py --threads 48 --memory 1400 -1 trim_p_1.fq.gz -2 trim_p_2.fq.gz --careful --cov-cutoff auto -o spadesout
+
+#### 8. Simple parameters for genome
+    N50: 6.8 Kb
+    Largest contig: 540 Kb
+    Total size: 720 Mb (larger than estimation, indicating insufficient merging of repeats)
+    Busco score : 87% (busco4) 
+**This genome assembly from spades is just used for survey. A real genome should come from the long reads assembler polished by these short reads.**
diff --git a/tutorials/read.md b/tutorials/read.md
@@ -0,0 +1 @@
+#### The tutorials used for teaching and training of bioinformatics newcomers

Original file line number	Diff line number	Diff line change
`@@ -113,3 +113,6 @@ def dic2fasta(record_dict,out="record_dict.fasta"):`
`113`	`113`
`114`	`114`
`115`	`115`
	`116`	`+`
	`117`	`+`
	`118`	`+`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+#### The tutorials used for teaching and training of bioinfor newcomers`