-
Notifications
You must be signed in to change notification settings - Fork 3
annotate_my_genomes benchmarking
We benchmarked annotate_my_genomes performance against:
- BRAKER1 and BRAKER2 (https://github.com/Gaius-Augustus/BRAKER)
- TSEBRA (https://github.com/Gaius-Augustus/TSEBRA)
- Augustus (https://github.com/Gaius-Augustus/Augustus)
At January 2022, each program was tested on Ubuntu 16.04.7 LTS (xenial) and installed as follows:
See here: https://github.com/cfarkas/annotate_my_genomes#ii-installation
In my machine, ~
equals /home/wslab
### Create conda environment
conda config --add channels conda-forge
conda config --add channels bioconda
conda create -n braker2 braker2
conda activate braker2
### Dependencies
conda install -c anaconda perl
conda install -c bioconda perl-app-cpanminus
conda install -c bioconda perl-hash-merge
conda install -c bioconda perl-parallel-forkmanager
conda install -c bioconda perl-scalar-util-numeric
conda install -c bioconda perl-yaml
conda install -c bioconda perl-class-data-inheritable
conda install -c bioconda perl-exception-class
conda install -c bioconda perl-test-pod
conda install -c anaconda biopython
conda install -c bioconda perl-file-which # skip if you are not comparing to reference annotation
conda install -c bioconda perl-mce
conda install -c bioconda perl-threaded
conda install -c bioconda perl-list-util
conda install -c bioconda perl-math-utils
conda install -c bioconda cdbtools
### The following Perl modules are required:
sudo cpanm YAML
sudo cpanm Hash::Merge
sudo cpanm Parallel::ForkManager
sudo cpanm MCE::Mutex
sudo cpanm Thread::Queue
sudo cpanm threads
### clone repository
git clone https://github.com/Gaius-Augustus/BRAKER.git
### Unpack gmes_linux_64_4.tar.gz and and place gm_key_64.gz inside /BRAKER/gmes_linux_64_4/ folder and:
cd ~/BRAKER/gmes_linux_64_4
rm gm_key
gunzip gm_key_64.gz # personal user key obtained from here: http://exon.gatech.edu/GeneMark/license_download.cgi
mv gm_key_64 gm_key
cp gm_key ~/.gm_key
# check install as follows:
bash check_install.bash
### Installing AUGUSTUS
cd ~/BRAKER/
# This program requires cmake
conda install -c anaconda cmake
# root
sudo -i
apt-get update
apt-get install build-essential wget git autoconf
# Install dependencies for AUGUSTUS comparative gene prediction mode (CGP)
apt-get install libgsl-dev libboost-all-dev libsuitesparse-dev liblpsolve55-dev
apt-get install libsqlite3-dev libmysql++-dev
# Install dependencies for the optional support of gzip compressed input files
apt-get install libboost-iostreams-dev zlib1g-dev
# Install dependencies for bam2hints and filterBam
apt-get install libbamtools-dev zlib1g-dev
# Install additional dependencies for bam2wig
apt-get install samtools libhts-dev
# Install additional dependencies for homGeneMapping and utrrnaseq
apt-get install libboost-all-dev
# Install additional dependencies for scripts
apt-get install cdbfasta diamond-aligner libfile-which-perl libparallel-forkmanager-perl libyaml-perl libdbd-mysql-perl
apt-get install --no-install-recommends python3-biopython
# make Augustus
# Until Ubuntu 21.04: sudo apt install augustus augustus-data augustus-doc
cd ~/BRAKER/
git clone https://github.com/Gaius-Augustus/Augustus.git
cd Augustus
make
### To run BRAKER2, export the following paths
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/Augustus/config/
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
### To Run /BRAKER/scripts/braker.pl, users will need Scalar::Util::Numeric perl module installed.
This module can be installed as follows:
cpanm Scalar::Util::Numeric
or
conda install -c bioconda perl-scalar-util-numeric
# See here: https://github.com/Gaius-Augustus/BRAKER/blob/master/docs/long_reads/long_read_protocol.md
## Clone BRAKER in $HOME (e.g.: /home/wslab/)
conda activate braker2 # from BRAKER2 install
cd
git clone https://github.com/Gaius-Augustus/BRAKER
cd BRAKER
git checkout long_reads
export PATH="/home/wslab/BRAKER/scripts:$PATH"
## GeneMarkS-T
place GeneMarkS-T inside BRAKER folder and:
mkdir GMST
mv gm_key_64.gz gmst_linux_64.tar.gz ./GMST
cd GMST/
gunzip gm_key_64.gz gmst_linux_64.tar.gz
tar -xvf gmst_linux_64.tar
rm gm_key
mv gm_key_64 gm_key
# test
perl gmst.pl test.fa
export PATH="/home/wslab/BRAKER/GMST/gmst.pl:$PATH"
## TSEBRA
cd
git clone https://github.com/Gaius-Augustus/TSEBRA
cd TSEBRA
git checkout long_reads
export PATH="/home/wslab/TSEBRA/bin:$PATH"
We tested each pipeline on the following datasets:
### PacBio data (IsoSeq)
mkdir PacBio_galGal6 && cd PacBio_galGal6
# HH23 all transcripts
wget -O m54027_190807_082031.subreads.bam ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887439/m54027_190807_082031.subreads.bam
# HH23 >4kb_transcripts_enrichment
wget -O m54027_191028_202826.subreads.bam ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887440/m54027_191028_202826.subreads.bam
# HH30 all transcripts
wget -O m54027_190808_044316.subreads.bam ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887441/m54027_190808_044316.subreads.bam
# HH30 >4kb_transcripts_enrichment
wget -O m54027_191029_165345.subreads.bam ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887442/m54027_191029_165345.subreads.bam
### Illumina data (paired-end)
# HH23 rep1
wget -O 41-A3_S1_R1_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887419/41-A3_S1_R1_001.fastq.gz
wget -O 41-A3_S1_R2_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887419/41-A3_S1_R2_001.fastq.gz
# HH23 rep2
wget -O 41-B3_S2_R1_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887420/41-B3_S2_R1_001.fastq.gz
wget -O 41-B3_S2_R2_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887420/41-B3_S2_R2_001.fastq.gz
# HH30 rep1
wget -O 71-A4_S3_R1_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887421/71-A4_S3_R1_001.fastq.gz
wget -O 71-A4_S3_R2_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887421/71-A4_S3_R2_001.fastq.gz
# HH30 rep2
wget -O 71-B4_S4_R1_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887422/71-B4_S4_R1_001.fastq.gz
wget -O 71-B4_S4_R2_001.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR388/ERR3887422/71-B4_S4_R2_001.fastq.gz
mkdir PacBio_Mus_musculus && cd PacBio_Mus_musculus
### PacBio data (IsoSeq)
# GSM4118746: C57/BL6J oocyte cells pacbio; Mus musculus; RNA-Seq
wget -O m54180_190618_225637.subreads.bam https://sra-pub-src-2.s3.amazonaws.com/SRR10267009/m54180_190618_225637.subreads.bam
### Illumina (paired-end)
prefetch --max-size 800G -O ./ SRR10266994 # GSM4118734: C57/BL6J oocyte cells rep2; Mus musculus; RNA-Seq
prefetch --max-size 800G -O ./ SRR10266993 # GSM4118733: C57/BL6J oocyte cells rep1; Mus musculus; RNA-Seq
prefetch --max-size 800G -O ./ SRR10266992 # GSM4118733: C57/BL6J oocyte cells rep1; Mus musculus; RNA-Seq
fastq-dump --split-files SRR10266992.sra
fastq-dump --split-files SRR10266993.sra
fastq-dump --split-files SRR10266994.sra
mkdir PacBio_Homo_sapiens && cd PacBio_Homo_sapiens
# PacBio data (IsoSeq)
prefetch --max-size 800G -O ./ SRR12933206 # HAP1 Iso-Seq; Homo sapiens; RNA-Seq
fastq-dump SRR12933206.sra
### Illumina (paired-end)
prefetch --max-size 800G -O ./ SRR14338441 # HAP1_wildtype_1; Homo sapiens; RNA-Seq
prefetch --max-size 800G -O ./ SRR14338442 # HAP1_wildtype_2; Homo sapiens; RNA-Seq
prefetch --max-size 800G -O ./ SRR14338443 # HAP1_wildtype_3; Homo sapiens; RNA-Seq
prefetch --max-size 800G -O ./ SRR14338444 # HAP1_wildtype_4; Homo sapiens; RNA-Seq
fastq-dump --split-files SRR14338441.sra
fastq-dump --split-files SRR14338442.sra
fastq-dump --split-files SRR14338443.sra
fastq-dump --split-files SRR14338444.sra
#
mkdir PacBio_Danio_rerio && cd PacBio_Danio_rerio
### PacBio data (fastq files)
prefetch --max-size 800G -O ./ SRR5865381 # GSM2717135: long-reads preZGA; Danio rerio; RNA-Seq
prefetch --max-size 800G -O ./ SRR5865382 # GSM2717136: long-reads postZGA; Danio rerio; RNA-Seq
fastq-dump SRR5865381.sra
fastq-dump SRR5865382.sra
cat SRR5865381.fastq.gz SRR5865382.fastq.gz > long_reads.fastq.gz
gunzip long_reads.fastq.gz
### illumina data (paired-end)
prefetch --max-size 800G -O ./ SRR5865383 # GSM2717137: short-reads preZGA; Danio rerio; RNA-Seq
prefetch --max-size 800G -O ./ SRR5865384 # GSM2717138: short-reads postZGA; Danio rerio; RNA-Seq
fastq-dump --split-files SRR5865383.sra
fastq-dump --split-files SRR5865384.sra
### PacBio data (fastq files)
wget -O L1_rep1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245464/L1_rep1.fastq.gz
wget -O L1_rep2.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245465/L1_rep2.fastq.gz
wget -O L2_rep1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245466/L2_rep1.fastq.gz
wget -O L2_rep2.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245467/L2_rep2.fastq.gz
wget -O L3_rep1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245468/L3_rep1.fastq.gz
wget -O L3_rep2.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245469/L3_rep2.fastq.gz
wget -O L4_rep1.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245470/L4_rep1.fastq.gz
wget -O L4_rep2.fastq.gz ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR324/ERR3245471/L4_rep2.fastq.gz
We processed and analyzed sequencing datasets per species as follows:
# Directory
cd PacBio_galGal6
# Genome (download with genome-download program from annotate_my_genomes)
git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository, install if necessary.
conda activate annotate_my_genomes
genome-download galGal6
###########################
### Illumina alignments ###
###########################
# fastp install
conda install -c bioconda fastp
# Trimming of illumina fastq datasets
fastp -w 16 -i 41-A3_S1_R1_001.fastq.gz -I 41-A3_S1_R2_001.fastq.gz -o 41-A3_S1_R1_001.fq.gz -O 41-A3_S1_R2_001.fq.gz
fastp -w 16 -i 41-B3_S2_R1_001.fastq.gz -I 41-B3_S2_R2_001.fastq.gz -o 41-B3_S2_R1_001.fq.gz -O 41-B3_S2_R2_001.fq.gz
fastp -w 16 -i 71-A4_S3_R1_001.fastq.gz -I 71-A4_S3_R2_001.fastq.gz -o 71-A4_S3_R1_001.fq.gz -O 71-A4_S3_R2_001.fq.gz
fastp -w 16 -i 71-B4_S4_R1_001.fastq.gz -I 71-B4_S4_R2_001.fastq.gz -o 71-B4_S4_R1_001.fq.gz -O 71-B4_S4_R2_001.fq.gz
# build hisat2 index with 30 threads
hisat2-build galGal6.fa -p 30 galGal6.fa
# align short illumina reads agains galGal6, using 30 threads
hisat2 -x galGal6.fa -p 50 -1 41-A3_S1_R1_001.fq.gz -2 41-A3_S1_R2_001.fq.gz | samtools view -bS - > 41.bam
hisat2 -x galGal6.fa -p 50 -1 41-B3_S2_R1_001.fq.gz -2 41-B3_S2_R2_001.fq.gz | samtools view -bS - > 42.bam
hisat2 -x galGal6.fa -p 50 -1 71-A4_S3_R1_001.fq.gz -2 71-A4_S3_R2_001.fq.gz | samtools view -bS - > 71.bam
hisat2 -x galGal6.fa -p 50 -1 71-B4_S4_R1_001.fq.gz -2 71-B4_S4_R2_001.fq.gz | samtools view -bS - > 72.bam
# sorting bam files
samtools sort 41.bam -@ 50 > 41.sorted.bam
samtools sort 42.bam -@ 50 > 42.sorted.bam
samtools sort 71.bam -@ 50 > 71.sorted.bam
samtools sort 72.bam -@ 50 > 72.sorted.bam
# index bam files
samtools index 41.sorted.bam -@ 15 && rm 41.bam
samtools index 42.sorted.bam -@ 15 && rm 42.bam
samtools index 71.sorted.bam -@ 15 && rm 71.bam
samtools index 72.sorted.bam -@ 15 && rm 72.bam
# Merge illumina bam files
samtools merge Illumina_merged.bam 41.sorted.bam 42.sorted.bam 71.sorted.bam 72.sorted.bam -@ 30 -f
samtools sort -o Illumina_merged.sorted.bam Illumina_merged.bam -@ 30
samtools index Illumina_merged.sorted.bam -@ 30
rm Illumina_merged.bam
#############################################
### PacBio standard read length alignment ###
#############################################
samtools bam2fq m54027_190807_082031.subreads.bam > reads_HH23_s.fastq
samtools bam2fq m54027_190808_044316.subreads.bam > reads_HH30_s.fastq
minimap2 -ax splice galGal6.fa reads_HH23_s.fastq > aln_HH23_s.sam -t 30
minimap2 -ax splice galGal6.fa reads_HH30_s.fastq > aln_HH30_s.sam -t 30
# SAM to bam
samtools view -S -b aln_HH23_s.sam > aln_HH23_s.bam --threads 30
samtools sort aln_HH23_s.bam> aln_HH23_s.sorted.bam --threads 30 && samtools index aln_HH23_s.sorted.bam -@ 30
rm aln_HH23_s.sam
samtools view -S -b aln_HH30_s.sam > aln_HH30_s.bam --threads 30
samtools sort aln_HH30_s.bam> aln_HH30_s.sorted.bam --threads 30 && samtools index aln_HH30_s.sorted.bam -@ 30
rm aln_HH30_s.sam
#########################################
### PacBio long read length alignment ###
#########################################
samtools bam2fq m54027_191028_202826.subreads.bam > reads_HH23_l.fastq
samtools bam2fq m54027_191029_165345.subreads.bam > reads_HH30_l.fastq
minimap2 -ax splice galGal6.fa reads_HH23_l.fastq > aln_HH23_l.sam -t 30
minimap2 -ax splice galGal6.fa reads_HH30_l.fastq > aln_HH30_l.sam -t 30
# SAM to bam
samtools view -S -b aln_HH23_l.sam > aln_HH23_l.bam --threads 30
samtools sort aln_HH23_l.bam > aln_HH23_l.sorted.bam --threads 30 && samtools index aln_HH23_l.sorted.bam -@ 30
rm aln_HH23_l.sam
samtools view -S -b aln_HH30_l.sam > aln_HH30_l.bam --threads 30
samtools sort aln_HH30_l.bam > aln_HH30_l.sorted.bam --threads 30 && samtools index aln_HH30_l.sorted.bam -@ 30
rm aln_HH30_l.sam
############################################
### Merge PacBio and illumina alignments ###
############################################
# PacBio
samtools merge 4d_7d_merged.bam aln_HH23_s.sorted.bam aln_HH30_s.sorted.bam aln_HH23_l.sorted.bam aln_HH30_l.sorted.bam -@ 30 --reference galGal6.fa
samtools sort -o 4d_7d_merged.sorted.bam 4d_7d_merged.bam -@ 30 && samtools index 4d_7d_merged.sorted.bam -@ 30
rm 4d_7d_merged.bam
# all
samtools merge PacBio_Illumina_merged.bam 4d_7d_merged.sorted.bam 41.sorted.bam 42.sorted.bam 71.sorted.bam 72.sorted.bam -@ 30 -f
samtools sort -o PacBio_Illumina_merged.sorted.bam PacBio_Illumina_merged.bam -@ 30
samtools index PacBio_Illumina_merged.sorted.bam -@ 30
rm PacBio_Illumina_merged.bam
######################
### GTF generation ###
######################
mkdir illumina_alone
mkdir PacBio+Illumina
### Illumina GTF
stringtie -p 1 -j 2 -c 2 -v -a 4 -o ./illumina_alone/illumina_transcripts.gtf Illumina_merged.sorted.bam
### Illumina + PacBio GTF
stringtie -p 1 -j 2 -c 2 -v -a 4 -o ./PacBio+Illumina/transcripts.gtf PacBio_Illumina_merged.sorted.bam
### Illumina + PacBio GTF guided
stringtie -p 1 -j 2 -c 2 -v -a 4 -o ./PacBio+Illumina/transcripts-guided.gtf -G galGal6_ncbiRefSeq.gtf PacBio_Illumina_merged.sorted.bam
####################################
### annotate_my_genomes pipeline ###
####################################
#### annotate_my_genomes (illumina + PacBio)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output1
begin=`date +%s`
add-ncbi-annotation -a transcripts.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 30 -o output1
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#### annotate_my_genomes (illumina + PacBio, guided)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output2
begin=`date +%s`
add-ncbi-annotation -a transcripts-guided.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 30 -o output2
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#### annotate_my_genomes (illumina alone)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output3
begin=`date +%s`
add-ncbi-annotation -a illumina_transcripts.gtf -n galGal6_ncbiRefSeq.gtf -r galGal6.gtf -g galGal6.fa -c gawn_config.sh -t 30 -o output3
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
########################
### BRAKER2 pipeline ###
########################
#### test1 with: PacBio_Illumina_merged.sorted.bam
mkdir test1_braker2 && cd test1_braker2
conda activate braker2
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Gallus_gallus
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --genome=galGal6.fa --species=Gallus_gallus --bam=PacBio_Illumina_merged.sorted.bam
#### test2 with: Illumina_merged.sorted.bam
mkdir test2_braker2 && cd test2_braker2
conda activate braker2
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Gallus_gallus
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --genome=galGal6.fa --species=Gallus_gallus --bam=Illumina_merged.sorted.bam
#### test3 with Illumina_merged.sorted.bam and Gallus gallus uniprot proteome 9031
mkdir test3_braker2 && cd test3_braker2
# Downloading Gallus gallus proteome 9031
conda activate annotate_my_genomes
perl ./annotate_my_genomes/additional_scripts/download_proteome_uniprot.pl 9031
conda activate braker2
# see and fix this issue with diamond: https://github.com/Gaius-Augustus/BRAKER/issues/430
cd /home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/dependencies/
# downloading diamond tool
wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
gunzip diamond-linux64.tar.gz
tar -xvf diamond-linux64.tar
# RUN
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Gallus_gallus
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --genome=galGal6.fa --species=Gallus_gallus --bam=Illumina_merged.sorted.bam
#### test4 with PacBio_Illumina_merged.sorted.bam and Gallus gallus uniprot proteome 9031
mkdir test4_braker2 && cd test4_braker2
# Downloading Gallus gallus proteome 9031
conda activate annotate_my_genomes
perl ./annotate_my_genomes/additional_scripts/download_proteome_uniprot.pl 9031
conda activate braker2
# see and fix this issue with diamond: https://github.com/Gaius-Augustus/BRAKER/issues/430
cd /home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/dependencies/
# downloading diamond tool
wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
gunzip diamond-linux64.tar.gz
tar -xvf diamond-linux64.tar
# RUN
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Gallus_gallus
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --genome=galGal6.fa --species=Gallus_gallus --bam=PacBio_Illumina_merged.sorted.bam
# Directory
cd PacBio_Mus_musculus
# Genome (download with genome-download program from annotate_my_genomes)
git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository, install if necessary.
conda activate annotate_my_genomes
genome-download mm10
###########################
### Illumina alignments ###
###########################
# fastp install
conda install -c bioconda fastp
# Trimming of illumina fastq datasets
fastp -w 16 -i SRR10266992_1.fastq -I SRR10266992_2.fastq -o SRR10266992_1.fq.gz -O SRR10266992_2.fq.gz
fastp -w 16 -i SRR10266993_1.fastq -I SRR10266993_2.fastq -o SRR10266993_1.fq.gz -O SRR10266993_2.fq.gz
fastp -w 16 -i SRR10266994_1.fastq -I SRR10266994_2.fastq -o SRR10266994_1.fq.gz -O SRR10266994_2.fq.gz
# build hisat2 index with 30 threads
hisat2-build mm10.fa -p 30 mm10.fa
# align short illumina reads agains mm10, using 30 threads
hisat2 -x mm10 -p 30 -1 SRR10266992_1.fq.gz -2 SRR10266992_2.fq.gz | samtools view -bS - > SRR10266992.bam
hisat2 -x mm10 -p 30 -1 SRR10266993_1.fq.gz -2 SRR10266993_2.fq.gz | samtools view -bS - > SRR10266993.bam
hisat2 -x mm10 -p 30 -1 SRR10266994_1.fq.gz -2 SRR10266994_2.fq.gz | samtools view -bS - > SRR10266994.bam
########################
### PacBio alignment ###
########################
samtools bam2fq m54180_190618_225637.subreads.bam > reads.fastq
minimap2 -ax splice mm10.fa reads.fastq -t 30 | samtools view -bS - > PacBio.bam
gzip reads.fastq
############################################
### Merge PacBio and illumina alignments ###
############################################
samtools merge merged.bam SRR10266994.bam SRR10266993.bam SRR10266992.bam PacBio.bam -@ 30 -f
samtools sort -o merged.sorted.bam merged.bam -@ 40
samtools index merged.sorted.bam
rm merged.bam
######################
### GTF generation ###
######################
conda activate annotate_my_genomes
mkdir illumina_alone
mkdir PacBio+Illumina
# Obtaining GTF (transcripts.gtf) from merged alignments using StringTie
stringtie -p 1 -v -a 4 -o transcripts.gtf merged.sorted.bam
# Obtaining GTF (transcripts.gtf) from merged alignments using StringTie (Guided)
stringtie -p 1 -v -a 4 -G mm10_ncbiRefSeq.gtf -o transcripts-guided.gtf merged.sorted.bam
####################################
### annotate_my_genomes pipeline ###
####################################
#### annotate_my_genomes (PacBio + Illumina)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output1
begin=`date +%s`
add-ncbi-annotation -a transcripts.gtf -n mm10_ncbiRefSeq.gtf -r mm10.gtf -g m10.fa -t 30 -c gawn_config.sh -o output1
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#### annotate_my_genomes (PacBio + Illumina, guided)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output2
begin=`date +%s`
add-ncbi-annotation -a transcripts-guided.gtf -n mm10_ncbiRefSeq.gtf -r mm10.gtf -g mm10.fa -t 30 -c gawn_config.sh -o output2
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
########################
### BRAKER2 pipeline ###
########################
# Downloading Mus musculus proteome 10090
conda activate annotate_my_genomes
perl ./annotate_my_genomes/additional_scripts/download_proteome_uniprot.pl 10090
conda activate braker2
# see and fix this issue with diamond: https://github.com/Gaius-Augustus/BRAKER/issues/430
cd /home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/dependencies/
# downloading diamond tool
wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
gunzip diamond-linux64.tar.gz
tar -xvf diamond-linux64.tar
# RUN
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Mus_musculus
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --etpmode --genome=mm10.fa --species=Mus_musculus --prot_seq=10090.fasta --bam=merged.sorted.bam
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
# Directory
cd PacBio_Homo_sapiens
# Genome (download with genome-download program from annotate_my_genomes)
git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository, install if necessary.
conda activate annotate_my_genomes
genome-download hg38
###########################
### Illumina alignments ###
###########################
# fastp install
conda install -c bioconda fastp
# Trimming of illumina fastq datasets
fastp -w 16 -i SRR14338441_1.fastq -I SRR14338441_2.fastq -o SRR14338441_1.fq.gz -O SRR14338441_2.fq.gz
fastp -w 16 -i SRR14338442_1.fastq -I SRR14338442_2.fastq -o SRR14338442_1.fq.gz -O SRR14338442_2.fq.gz
fastp -w 16 -i SRR14338443_1.fastq -I SRR14338443_2.fastq -o SRR14338443_1.fq.gz -O SRR14338443_2.fq.gz
fastp -w 16 -i SRR14338444_1.fastq -I SRR14338444_2.fastq -o SRR14338444_1.fq.gz -O SRR14338444_2.fq.gz
# build hisat2 index with 30 threads
hisat2-build mm10.fa -p 30 mm10.fa
# align short illumina reads agains mm10, using 30 threads
hisat2 -x hg38 -p 30 -1 SRR14338441_1.fastq.gz -2 SRR14338441_2.fastq.gz | samtools view -bS - > SRR14338441.bam
hisat2 -x hg38 -p 30 -1 SRR14338442_1.fastq.gz -2 SRR14338442_2.fastq.gz | samtools view -bS - > SRR14338442.bam
hisat2 -x hg38 -p 30 -1 SRR14338443_1.fastq.gz -2 SRR14338443_2.fastq.gz | samtools view -bS - > SRR14338443.bam
hisat2 -x hg38 -p 30 -1 SRR14338444_1.fastq.gz -2 SRR14338444_2.fastq.gz | samtools view -bS - > SRR14338444.bam
########################
### PacBio alignment ###
########################
minimap2 -ax splice hg38.fa SRR12933206.fastq -t 30 | samtools view -bS - > SRR12933206.bam
#################################
### Merge illumina alignments ###
#################################
samtools merge illumina-merged.bam SRR14338441.bam SRR14338442.bam SRR14338443.bam SRR14338444.bam -@ 40 -f
samtools sort -o illumina-merged.sorted.bam illumina-merged.bam -@ 40
samtools index illumina-merged.sorted.bam
rm illumina-merged.bam
############################################
### Merge PacBio and illumina alignments ###
############################################
samtools merge merged.bam SRR12933206.bam illumina-merged.sorted.bam -@ 50 -f
samtools sort -o merged.sorted.bam merged.bam -@ 50
samtools index merged.sorted.bam
rm merged.bam
######################
### GTF generation ###
######################
# Obtaining GTF (transcripts.gtf) from the above alignment using StringTie
stringtie -p 1 -v -a 4 -o transcripts.gtf merged.sorted.bam
# Obtaining GTF (transcripts.gtf) from the above alignment using StringTie (Guided)
stringtie -p 1 -v -a 4 -G hg38_ncbiRefSeq.gtf -o transcripts-guided.gtf merged.sorted.bam
####################################
### annotate_my_genomes pipeline ###
####################################
### annotate_my_genomes (PacBio + Illumina)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output1
begin=`date +%s`
add-ncbi-annotation -a transcripts.gtf -n hg38_ncbiRefSeq.gtf -r hg38.gtf -g hg38.fa -t 30 -c gawn_config.sh -o output1
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
### annotate_my_genomes (PacBio + Illumina, guided)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output2
begin=`date +%s`
add-ncbi-annotation -a transcripts-guided.gtf -n hg38_ncbiRefSeq.gtf -r hg38.gtf -g hg38.fa -t 30 -c gawn_config.sh -o output2
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
########################
### BRAKER2 pipeline ###
########################
# Downloading Homo sapiens proteome 9606
conda activate annotate_my_genomes
perl ./annotate_my_genomes/additional_scripts/download_proteome_uniprot.pl 9606
conda activate braker2
# see and fix this issue with diamond: https://github.com/Gaius-Augustus/BRAKER/issues/430
cd /home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/dependencies/
# downloading diamond tool
wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
gunzip diamond-linux64.tar.gz
tar -xvf diamond-linux64.tar
# RUN
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Homo_sapiens
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --etpmode --genome=hg38.fa --species=Homo_sapiens --prot_seq=9606.fasta --bam=merged.sorted.bam
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
# Directory
cd PacBio_Danio_rerio
# Genome (download with genome-download program from annotate_my_genomes)
git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository, install if necessary.
conda activate annotate_my_genomes
genome-download danRer11
###########################
### Illumina alignments ###
###########################
# fastp install
conda install -c bioconda fastp
# Trimming of illumina fastq datasets
fastp -w 16 -i SRR5865383_1.fastq -I SRR5865383_2.fastq -o SRR5865383_1.fq.gz -O SRR5865383_2.fq.gz
fastp -w 16 -i SRR5865384_1.fastq -I SRR5865384_2.fastq -o SRR5865384_1.fq.gz -O SRR5865384_2.fq.gz
# align short illumina reads agains danRer11, using 30 threads
hisat2 -x danRer11 -p 30 -1 SRR5865383_1.fq.gz -2 SRR5865383_2.fq.gz | samtools view -bS - > SRR5865383.bam
hisat2 -x danRer11 -p 30 -1 SRR5865384_1.fq.gz -2 SRR5865384_2.fq.gz | samtools view -bS - > SRR5865384.bam
#########################
### PacBio alignments ###
#########################
minimap2 -ax splice danRer11.fa SRR5865381.fastq.gz -t 30 | samtools view -bS - > SRR5865381.bam
minimap2 -ax splice danRer11.fa SRR5865382.fastq.gz -t 30 | samtools view -bS - > SRR5865382.bam
#################################
### Merge illumina alignments ###
#################################
samtools merge illumina-merged.bam SRR5865383.bam SRR5865384.bam -@ 30 -f
samtools sort -o illumina-merged.sorted.bam illumina-merged.bam -@ 30
samtools index illumina-merged.sorted.bam
rm merged.bam
############################################
### Merge PacBio and illumina alignments ###
############################################
samtools merge merged.bam SRR5865383.bam SRR5865384.bam SRR5865381.bam SRR5865382.bam -@ 30 -f
samtools sort -o merged.sorted.bam merged.bam -@ 40
samtools index merged.sorted.bam
rm merged.bam
######################
### GTF generation ###
######################
# Obtaining GTF (transcripts.gtf) from the above alignment using StringTie
stringtie -p 1 -v -a 4 -o transcripts.gtf merged.sorted.bam
# Obtaining GTF (transcripts.gtf) from the above alignment using StringTie (Guided)
stringtie -p 1 -v -a 4 -G danRer11_ncbiRefSeq.gtf -o transcripts-guided.gtf merged.sorted.bam
####################################
### annotate_my_genomes pipeline ###
####################################
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output1
begin=`date +%s`
add-ncbi-annotation -a transcripts.gtf -n danRer11_ncbiRefSeq.gtf -r danRer11.gtf -g danRer11.fa -t 30 -c gawn_config.sh -o output1
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#### annotate_my_genomes (PacBio + Illumina, guided)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output2
begin=`date +%s`
add-ncbi-annotation -a transcripts-guided.gtf -n danRer11_ncbiRefSeq.gtf -r danRer11.gtf -g danRer11.fa -t 30 -c gawn_config.sh -o output2
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
########################
### BRAKER2 pipeline ###
########################
# Downloading Danio rerio proteome 7955
conda activate annotate_my_genomes
perl ./annotate_my_genomes/additional_scripts/download_proteome_uniprot.pl 7955
conda activate braker2
# see and fix this issue with diamond: https://github.com/Gaius-Augustus/BRAKER/issues/430
cd /home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/dependencies/
# downloading diamond tool
wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
gunzip diamond-linux64.tar.gz
tar -xvf diamond-linux64.tar
# RUN
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Danio_rerio
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --etpmode --genome=danRer11.fa --species=Danio_renio --prot_seq=7955.fasta --bam=merged.sorted.bam
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
# Directory
cd PacBio_C_elegans
# Genome (download with genome-download program from annotate_my_genomes)
git clone https://github.com/cfarkas/annotate_my_genomes.git # clone repository, install if necessary.
conda activate annotate_my_genomes
genome-download ce11
#########################
### PacBio alignments ###
#########################
minimap2 -ax splice ce11.fa L1_rep1.fastq.gz -t 30 | samtools view -bS - > L1_rep1.bam
minimap2 -ax splice ce11.fa L1_rep2.fastq.gz -t 30 | samtools view -bS - > L1_rep2.bam
minimap2 -ax splice ce11.fa L2_rep1.fastq.gz -t 30 | samtools view -bS - > L2_rep1.bam
minimap2 -ax splice ce11.fa L2_rep2.fastq.gz -t 30 | samtools view -bS - > L2_rep2.bam
minimap2 -ax splice ce11.fa L3_rep1.fastq.gz -t 30 | samtools view -bS - > L3_rep1.bam
minimap2 -ax splice ce11.fa L3_rep2.fastq.gz -t 30 | samtools view -bS - > L3_rep2.bam
minimap2 -ax splice ce11.fa L4_rep1.fastq.gz -t 30 | samtools view -bS - > L4_rep1.bam
minimap2 -ax splice ce11.fa L4_rep2.fastq.gz -t 30 | samtools view -bS - > L4_rep2.bam
###############################
### Merge PacBio alignments ###
###############################
samtools merge merged.bam L1_rep1.bam L1_rep2.bam L2_rep1.bam L2_rep2.bam L3_rep1.bam L3_rep2.bam L4_rep1.bam L4_rep2.bam -@ 30 -f
samtools sort -o merged.sorted.bam merged.bam -@ 30
samtools index merged.sorted.bam
rm merged.bam
######################
### GTF generation ###
######################
# Obtaining GTF (transcripts.gtf) from the above alignment using StringTie
stringtie -p 1 -v -a 4 -o transcripts.gtf merged.sorted.bam
# Obtaining GTF (transcripts.gtf) from the above alignment using StringTie (Guided)
stringtie -p 1 -v -a 4 -G ce11_ncbiRefSeq.gtf -o transcripts-guided.gtf merged.sorted.bam
####################################
### annotate_my_genomes pipeline ###
####################################
#### annotate_my_genomes (PacBio + Illumina)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output1
begin=`date +%s`
add-ncbi-annotation -a transcripts.gtf -n ce11_ncbiRefSeq.gtf -r ce11.gtf -g ce11.fa -t 30 -c gawn_config.sh -o output1
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#### annotate_my_genomes (PacBio + Illumina, guided)
conda activate annotate_my_genomes
# set gawn_config.sh with 30 threads
mkdir output2
begin=`date +%s`
add-ncbi-annotation -a transcripts-guided.gtf -n ce11_ncbiRefSeq.gtf -r ce11.gtf -g ce11.fa -t 30 -c gawn_config.sh -o output2
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
########################
### BRAKER2 pipeline ###
########################
# Downloading Caenorhabditis elegans proteome 6239
conda activate annotate_my_genomes
perl ./annotate_my_genomes/additional_scripts/download_proteome_uniprot.pl 6239
conda activate braker2
# see and fix this issue with diamond: https://github.com/Gaius-Augustus/BRAKER/issues/430
cd /home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/dependencies/
# downloading diamond tool
wget http://github.com/bbuchfink/diamond/releases/download/v2.0.13/diamond-linux64.tar.gz
gunzip diamond-linux64.tar.gz
tar -xvf diamond-linux64.tar
# RUN
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
rm -r -f /home/wslab/braker2_install/BRAKER/config/species/Caenorhabditis_elegans
braker.pl --verbosity=4 --AUGUSTUS_ab_initio --cores 30 --etpmode --genome=ce11.fa --species=Caenorhabditis_elegans --prot_seq=6239.fasta --bam=merged.sorted.bam
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
We benchmarked annotated GTF assemblies from each pipeline using Gffcompare, as follows:
cd PacBio_galGal6
### Test1 BRAKER2
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o test1_braker2 ./test1_braker2/braker/braker.gtf
### Test2 BRAKER2
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o test2_braker2 ./test2_braker2/braker/braker.gtf
### Test3 BRAKER2
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o test3_braker2 ./test3_braker2/braker/braker.gtf
### Test4 BRAKER2
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o test4_braker2 ./test4_braker2/braker/braker.gtf
### annotate_my_genomes (illumina + PacBio)
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o annotate_my_genomes_PacBio+illumina ./output1/output_files/
### annotate_my_genomes (illumina + PacBio, guided)
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o annotate_my_genomes_PacBio+illumina_guided ./output2/output_files/
### annotate_my_genomes (illumina alone)
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o annotate_my_genomes_illumina_alone ./output3/output_files/
### Ab initio AUGUSTUS predictions from UCSC (as GTF)
# info : https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=galGal6&g=augustusGene
wget http://hgdownload.cse.ucsc.edu/goldenpath/galGal6/database/augustusGene.txt.gz
gunzip augustusGene.txt.gz
wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf
chmod 755 genePredToGtf
cut -f 2- augustusGene.txt | ./genePredToGtf file stdin -source=AUGUSTUS AUGUSTUS.gtf
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o AUGUSTUS_compare AUGUSTUS.gtf
cd PacBio_Mus_musculus
### BRAKER2
gffcompare -R -r mm10_ncbiRefSeq.gtf -s mm10.fa -o braker_compare ./braker/braker.gtf
### annotate_my_genomes (illumina + PacBio)
gffcompare -R -r mm10_ncbiRefSeq.gtf -s mm10.fa -o annotate_my_genomes_PacBio+illumina ./output1/output_files/
### annotate_my_genomes (illumina + PacBio, guided)
gffcompare -R -r mm10_ncbiRefSeq.gtf -s mm10.fa -o annotate_my_genomes_PacBio+illumina_guided ./output2/output_files/
### Ab initio AUGUSTUS predictions from UCSC (as GTF)
# info : https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=mm10&g=augustusGene
wget http://hgdownload.cse.ucsc.edu/goldenpath/mm10/database/augustusGene.txt.gz
gunzip augustusGene.txt.gz
wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf
chmod 755 genePredToGtf
cut -f 2- augustusGene.txt | ./genePredToGtf file stdin -source=AUGUSTUS AUGUSTUS.gtf
gffcompare -R -r mm10_ncbiRefSeq.gtf -s mm10.fa -o AUGUSTUS_compare AUGUSTUS.gtf
cd PacBio_Homo_sapiens
### BRAKER2
gffcompare -R -r hg38_ncbiRefSeq.gtf -s hg38.fa -o braker_compare ./braker/braker.gtf
### annotate_my_genomes (illumina + PacBio)
gffcompare -R -r hg38_ncbiRefSeq.gtf -s hg38.fa -o annotate_my_genomes_PacBio+illumina ./output1/output_files/
### annotate_my_genomes (illumina + PacBio, guided)
gffcompare -R -r hg38_ncbiRefSeq.gtf -s hg38.fa -o annotate_my_genomes_PacBio+illumina_guided ./output2/output_files/
### Ab initio AUGUSTUS predictions from UCSC (as GTF)
# info : https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=hg38&g=augustusGene
wget http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/augustusGene.txt.gz
gunzip augustusGene.txt.gz
wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf
chmod 755 genePredToGtf
cut -f 2- augustusGene.txt | ./genePredToGtf file stdin -source=AUGUSTUS AUGUSTUS.gtf
gffcompare -R -r hg38_ncbiRefSeq.gtf -s hg38.fa -o AUGUSTUS_compare AUGUSTUS.gtf
cd PacBio_Danio_rerio
### BRAKER2
gffcompare -R -r danRer11_ncbiRefSeq.gtf -s danRer11.fa -o braker_compare ./braker/braker.gtf
### annotate_my_genomes (illumina + PacBio)
gffcompare -R -r danRer11_ncbiRefSeq.gtf -s danRer11.fa -o annotate_my_genomes_PacBio+illumina ./output1/output_files/
### annotate_my_genomes (illumina + PacBio, guided)
gffcompare -R -r danRer11_ncbiRefSeq.gtf -s danRer11.fa -o annotate_my_genomes_PacBio+illumina_guided ./output2/output_files/
### Ab initio AUGUSTUS predictions from UCSC (as GTF)
# info : https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=danRer11&g=augustusGene
wget http://hgdownload.cse.ucsc.edu/goldenpath/danRer11/database/augustusGene.txt.gz
gunzip augustusGene.txt.gz
wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf
chmod 755 genePredToGtf
cut -f 2- augustusGene.txt | ./genePredToGtf file stdin -source=AUGUSTUS AUGUSTUS.gtf
gffcompare -R -r danRer11_ncbiRefSeq.gtf -s danRer11.fa -o AUGUSTUS_compare AUGUSTUS.gtf
cd PacBio_C_elegans
### BRAKER2
gffcompare -R -r ce11_ncbiRefSeq.gtf -s ce11.fa -o braker_compare ./braker/braker.gtf
### annotate_my_genomes (illumina + PacBio)
gffcompare -R -r ce11_ncbiRefSeq.gtf -s ce11.fa -o annotate_my_genomes_PacBio+illumina ./output1/output_files/
### annotate_my_genomes (illumina + PacBio, guided)
gffcompare -R -r ce11_ncbiRefSeq.gtf -s ce11.fa -o annotate_my_genomes_PacBio+illumina_guided ./output2/output_files/
### Ab initio AUGUSTUS predictions from UCSC (as GTF)
# info : https://genome.ucsc.edu/cgi-bin/hgTrackUi?db=ce11&g=augustusGene
wget http://hgdownload.cse.ucsc.edu/goldenpath/ce11/database/augustusGene.txt.gz
gunzip augustusGene.txt.gz
wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/genePredToGtf
chmod 755 genePredToGtf
cut -f 2- augustusGene.txt | ./genePredToGtf file stdin -source=AUGUSTUS AUGUSTUS.gtf
gffcompare -R -r ce11_ncbiRefSeq.gtf -s ce11.fa -o AUGUSTUS_compare AUGUSTUS.gtf
We also benchmarked TSEBRA protocol using long reads, as described here: https://github.com/Gaius-Augustus/BRAKER/blob/master/docs/long_reads/long_read_protocol.md
cd PacBio_galGal6
### short read mapping (merge) : Illumina_merged.sorted.bam # from previous steps
### BRAKER1 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
rm -r -f /home/wslab/mambaforge/envs/braker2/config//species/Gallus_gallus
braker.pl --genome=/home/wslab/test1_annotate/galGal6.fa --softmasking --cores 30 --bam=Illumina_merged.sorted.bam --workingdir=./braker1/ 2> ./braker1.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
### BRAKER2 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
braker.pl --genome=/home/wslab/test1_annotate/galGal6.fa --softmasking --cores=30 --epmode --prot_seq=9031.fasta --workingdir=./braker2/ 2> ./braker2.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
### Long read transcriptome processing
conda activate braker2
# 1. mapping polished IsoSeq reads
minimap2 -ax splice galGal6.fa all_seqs.fasta > PacBio.merged.sam -t 30
samtools view -S -b PacBio.merged.sam > PacBio.merged.bam --threads 30
samtools sort PacBio.merged.bam> PacBio.merged.sorted.bam --threads 30 && samtools index PacBio.merged.sorted.bam -@ 30
# 2. GTF generation with stringtie
stringtie -p 1 -j 1 -c 0.001 -s 1 -v -a 1 -o PacBio.merged.gtf PacBio.merged.sorted.bam
# 3. Obtaining assembled transcripts with gffread
gffread -w assembled-transcripts.fasta -g galGal6.fa PacBio.merged.gtf
# 4. Getting open read frames (ORF) from the transcript sequences with GeneMarkS GMST
perl /home/wslab/BRAKER/GMST/gmst.pl --strand direct assembled-transcripts.fasta --output gmst.out --format GFF
# 5. Use the GeneMarkS-T coordinates and the long-read transcripts to create a gene set in GTF format.
python /home/wslab/braker2_install/BRAKER/scripts/gmst2globalCoords.py -t PacBio.merged.gtf -p gmst.out -o gmst.global.gtf -g galGal6.fa
### TSEBRA GTF file
conda activate braker2
python /home/wslab/TSEBRA/bin/tsebra.py -g ./braker1/augustus.hints.gtf,./braker2/augustus.hints.gtf -e ./braker1/hintsfile.gff,./braker2/hintsfile.gff -l gmst.global.gtf -c /home/wslab/TSEBRA/config/long_reads.cfg -o tsebra.gtf
# gffcompare tsebra
cd PacBio_galGal6/
gffcompare -R -r galGal6_ncbiRefSeq.gtf -s galGal6.fa -o tsebra_compare tsebra.gtf
mkdir gffcompare_tsebra
mv tsebra_compare* ./gffcompare_tsebra
cd PacBio_Mus_musculus
### short read mapping:
samtools merge illumina-merged.bam SRR10266994.bam SRR10266993.bam SRR10266992.bam -@ 30 -f
samtools sort -o illumina-merged.sorted.bam illumina-merged.bam -@ 30
samtools index illumina-merged.sorted.bam
rm illumina-merged.bam
### BRAKER1 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
rm -r -f /home/wslab/mambaforge/envs/braker2/config//species/Mus_musculus
braker.pl --genome=mm10.fa --softmasking --cores 30 --bam=illumina-merged.sorted.bam --workingdir=./braker1/ 2> ./braker1.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
### BRAKER2 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
braker.pl --genome=mm10.fa --softmasking --cores=30 --epmode --prot_seq=10090.fasta --workingdir=./braker2/ 2> ./braker2.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
### Long read transcriptome processing
conda activate braker2
# 1. mapping polished IsoSeq reads
minimap2 -ax splice mm10.fa polished.fasta > PacBio.sam -t 30
samtools view -S -b PacBio.sam > PacBio.bam
samtools sort PacBio.bam > PacBio.sorted.bam --threads 30 && samtools index PacBio.sorted.bam -@ 30
# 2. GTF generation with stringtie
stringtie -p 1 -j 1 -c 0.001 -s 1 -v -a 1 -o PacBio.gtf PacBio.sorted.bam
# 3. Obtaining assembled transcripts with gffread
gffread -w assembled-transcripts.fasta -g mm10.fa PacBio.gtf
# 4. Getting open read frames (ORF) from the transcript sequences with GeneMarkS GMST
perl /home/wslab/BRAKER/GMST/gmst.pl --strand direct assembled-transcripts.fasta --output gmst.out --format GFF
# 5. Use the GeneMarkS-T coordinates and the long-read transcripts to create a gene set in GTF format.
python /home/wslab/braker2_install/BRAKER/scripts/gmst2globalCoords.py -t PacBio.gtf -p gmst.out -o gmst.global.gtf -g mm10.fa
### TSEBRA GTF file
conda activate braker2
python /home/wslab/TSEBRA/bin/tsebra.py -g ./braker1/augustus.hints.gtf,./braker2/augustus.hints.gtf -e ./braker1/hintsfile.gff,./braker2/hintsfile.gff -l gmst.global.gtf -c /home/wslab/TSEBRA/config/long_reads.cfg -o tsebra.gtf
# gffcompare tsebra
cd PacBio_Mus_musculus/
gffcompare -R -r mm10_ncbiRefSeq.gtf -s mm10.fa -o tsebra_compare tsebra.gtf
mkdir gffcompare_tsebra
mv tsebra_compare* ./gffcompare_tsebra
cd PacBio_Homo_sapiens
# BRAKER1 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
rm -r -f /home/wslab/mambaforge/envs/braker2/config//species/Homo_sapiens
braker.pl --genome=hg38.fa --softmasking --cores 30 --bam=illumina-merged.sorted.bam --workingdir=./braker1/ 2> ./braker1.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#
# BRAKER2 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
braker.pl --genome=hg38.fa --softmasking --cores=30 --epmode --prot_seq=9606.fasta --workingdir=./braker2/ 2> ./braker2.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#
### Long read transcriptome processing
conda activate braker2
# 1. mapping IsoSeq reads:
samtools sort -o SRR12933206.sorted.bam SRR12933206.bam -@ 30
samtools index SRR12933206.sorted.bam
# 2. GTF generation with stringtie
stringtie -p 1 -j 1 -c 0.001 -s 1 -v -a 1 -o PacBio.gtf SRR12933206.sorted.bam
# 3. Obtaining assembled transcripts with gffread
gffread -w assembled-transcripts.fasta -g hg38.fa PacBio.gtf
# 4. Getting open read frames (ORF) from the transcript sequences with GeneMarkS GMST
perl /home/wslab/BRAKER/GMST/gmst.pl --strand direct assembled-transcripts.fasta --output gmst.out --format GFF
# 5. Use the GeneMarkS-T coordinates and the long-read transcripts to create a gene set in GTF format.
python /home/wslab/braker2_install/BRAKER/scripts/gmst2globalCoords.py -t PacBio.gtf -p gmst.out -o gmst.global.gtf -g hg38.fa
### TSEBRA GTF file
conda activate braker2
python /home/wslab/TSEBRA/bin/tsebra.py -g ./braker1/augustus.hints.gtf,./braker2/augustus.hints.gtf -e ./braker1/hintsfile.gff,./braker2/hintsfile.gff -l gmst.global.gtf -c /home/wslab/TSEBRA/config/long_reads.cfg -o tsebra.gtf
# gffcompare tsebra
cd PacBio_Homo_sapiens/
gffcompare -R -r hg38_ncbiRefSeq.gtf -s hg38.fa -o tsebra_compare tsebra.gtf
mkdir gffcompare_tsebra
mv tsebra_compare* ./gffcompare_tsebra
cd PacBio_Danio_rerio
# BRAKER1 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
rm -r -f /home/wslab/mambaforge/envs/braker2/config//species/Danio_rerio
braker.pl --genome=danRer11.fa --softmasking --cores 30 --bam=illumina-merged.sorted.bam --workingdir=./braker1/ 2> ./braker1.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#
# BRAKER2 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
braker.pl --genome=danRer11.fa --softmasking --cores=30 --epmode --prot_seq=7955.fasta --workingdir=./braker2/ 2> ./braker2.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#
### Long read transcriptome processing
conda activate braker2
# 1. mapping IsoSeq reads
minimap2 -ax splice danRer11.fa SRR5865381.fastq.gz -t 30 | samtools view -bS - > SRR5865381.bam
minimap2 -ax splice danRer11.fa SRR5865382.fastq.gz -t 30 | samtools view -bS - > SRR5865382.bam
samtools merge PacBio.merged.bam SRR5865381.bam SRR5865382.bam -@ 30 -f
samtools sort -o PacBio.merged.sorted.bam PacBio.merged.bam -@ 30
samtools index PacBio.merged.sorted.bam
rm PacBio.merged.bam
# 2. GTF generation with stringtie
stringtie -p 1 -j 1 -c 1 -v -a 1 -o PacBio.gtf PacBio.merged.sorted.bam
# 3. Obtaining assembled transcripts with gffread
gffread -w assembled-transcripts.fasta -g danRer11.fa PacBio.gtf
# 4. Getting open read frames (ORF) from the transcript sequences with GeneMarkS GMST
perl /home/wslab/BRAKER/GMST/gmst.pl --strand direct assembled-transcripts.fasta --output gmst.out --format GFF
# 5. Use the GeneMarkS-T coordinates and the long-read transcripts to create a gene set in GTF format.
python /home/wslab/braker2_install/BRAKER/scripts/gmst2globalCoords.py -t PacBio.gtf -p gmst.out -o gmst.global.gtf -g danRer11.fa
### TSEBRA GTF file
conda activate braker2
python /home/wslab/TSEBRA/bin/tsebra.py -g ./braker1/augustus.hints.gtf,./braker2/augustus.hints.gtf -e ./braker1/hintsfile.gff,./braker2/hintsfile.gff -l gmst.global.gtf -c /home/wslab/TSEBRA/config/long_reads.cfg -o tsebra.gtf
# gffcompare tsebra
cd PacBio_Danio_rerio/
gffcompare -R -r hg38_ncbiRefSeq.gtf -s hg38.fa -o tsebra_compare tsebra.gtf
mkdir gffcompare_tsebra
mv tsebra_compare* ./gffcompare_tsebra
cd PacBio_C_elegans
# BRAKER1 run (with long reads)
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
rm -r -f /home/wslab/mambaforge/envs/braker2/config//species/Caenorhabditis_elegans
braker.pl --genome=ce11.fa --softmasking --cores 30 --bam=merged.sorted.bam --workingdir=./braker1/ 2> ./braker1.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#
# BRAKER2 run
conda activate braker2
begin=`date +%s`
export AUGUSTUS_CONFIG_PATH=/home/wslab/braker2_install/BRAKER/config
export GENEMARK_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/
export PROTHINT_PATH=/home/wslab/braker2_install/BRAKER/gmes_linux_64_4/ProtHint/bin/
echo $AUGUSTUS_CONFIG_PATH
echo $GENEMARK_PATH
echo $PROTHINT_PATH
braker.pl --genome=ce11.fa --softmasking --cores=30 --epmode --prot_seq=6239.fasta --workingdir=./braker2/ 2> ./braker2.log
end=`date +%s`
elapsed=`expr $end - $begin`
echo Time taken: $elapsed
#
### Long read transcriptome processing
conda activate braker2
# 1. mapping IsoSeq reads: merged.sorted.bam, already mapped.
# 2. GTF generation with stringtie
stringtie -p 1 -j 1 -c 0.001 -s 1 -v -a 1 -o PacBio.merged.gtf merged.sorted.bam
# 3. Obtaining assembled transcripts with gffread
gffread -w assembled-transcripts.fasta -g ce11.fa PacBio.merged.gtf
# 4. Getting open read frames (ORF) from the transcript sequences with GeneMarkS GMST
perl /home/wslab/BRAKER/GMST/gmst.pl --strand direct assembled-transcripts.fasta --output gmst.out --format GFF
# 5. Use the GeneMarkS-T coordinates and the long-read transcripts to create a gene set in GTF format.
python /home/wslab/braker2_install/BRAKER/scripts/gmst2globalCoords.py -t PacBio.merged.gtf -p gmst.out -o gmst.global.gtf -g ce11.fa
### TSEBRA GTF file
conda activate braker2
python /home/wslab/TSEBRA/bin/tsebra.py -g ./braker1/augustus.hints.gtf,./braker2/augustus.hints.gtf -e ./braker1/hintsfile.gff,./braker2/hintsfile.gff -l gmst.global.gtf -c /home/wslab/TSEBRA/config/long_reads.cfg -o tsebra.gtf
# gffcompare tsebra
cd PacBio_C_elegans/
gffcompare -R -r ce11_ncbiRefSeq.gtf -s ce11.fa -o tsebra_compare tsebra.gtf
mkdir gffcompare_tsebra
mv tsebra_compare* ./gffcompare_tsebra
We simulated PacBio and Illumina data using ReSeq (https://github.com/schmeing/ReSeq) and IsoSeqSim tools (https://github.com/yunhaowang/IsoSeqSim), respectively.
At January 2022 on Ubuntu 21.04 :
# debian Requirements:
sudo apt install build-essential
sudo apt-get install zlib1g-dev
sudo apt-get install libbz2-dev
sudo apt-get install python3-dev
sudo apt-get install git
sudo apt-get install cmake
sudo apt-get install swig
# boost : https://stackoverflow.com/questions/12578499/how-to-install-boost-on-ubuntu
wget -O boost_1_79_0.tar.gz https://sourceforge.net/projects/boost/files/boost/1.79.0/boost_1_79_0.tar.gz/download
tar xzvf boost_1_79_0.tar.gz
cd boost_1_79_0/
sudo apt-get update
sudo apt-get install build-essential g++ python-dev autotools-dev libicu-dev libbz2-dev libboost-all-dev
./bootstrap.sh --prefix=/usr/local/bin/
./b2
The Boost C++ Libraries were successfully built!
The following directory should be added to compiler include paths:
/mnt/05b41c36-2c2a-47ff-8539-154dec9a2e7d/boost_1_79_0
The following directory should be added to linker library paths:
/mnt/05b41c36-2c2a-47ff-8539-154dec9a2e7d/boost_1_79_0/stage/lib
# ReSeq
git clone https://github.com/schmeing/ReSeq.git
cd ReSeq
mkdir build
cd build
cmake ..
make
sudo cp ./bin/* /usr/local/bin/
git clone https://github.com/yunhaowang/IsoSeqSim.git
# binaries are located here:
cd /mnt/05b41c36-2c2a-47ff-8539-154dec9a2e7d/IsoSeqSim/bin/isoseqsim/
# Inputs:
- Reference annotation: galGal6_ncbiRefSeq.gtf
# Converting GTF to GPD format
python2 ./IsoSeqSim/utilities/py_isoseqsim_gtf2gpd.py -i galGal6_ncbiRefSeq.gtf -o galGal6_ncbiRefSeq.gpd
# Run IsoSeqSim:
python2 ./IsoSeqSim/bin/isoseqsim.py -g galGal6.fa -a galGal6_ncbiRefSeq.gtf \
--c5 ./IsoSeqSim/utilities/5_end_completeness.PacBio-P6-C4.tab \
--c3 ./IsoSeqSim/utilities/3_end_completeness.PacBio-P6-C4.tab \
--nbn 20 \
-o isoseqsim/simulated_reads_normal.fa -t galGal6_ncbiRefSeq.gpd --tempdir isoseqsim/temp_normal --cpu 30
1) Map real illumina RNA-seq reads against reference transcriptome:
# align short illumina reads agains NCBI_transcripts, using 30 threads
bowtie2 -p 50 -X 2000 -x NCBI_transcripts -1 41-A3_S1_R1_001.fastq.gz -2 41-A3_S1_R2_001.fastq.gz | samtools view -bS - > 41.bam
bowtie2 -p 50 -X 2000 -x NCBI_transcripts -1 41-B3_S2_R1_001.fastq.gz -2 41-B3_S2_R2_001.fastq.gz | samtools view -bS - > 42.bam
bowtie2 -p 50 -X 2000 -x NCBI_transcripts -1 71-A4_S3_R1_001.fastq.gz -2 71-A4_S3_R2_001.fastq.gz | samtools view -bS - > 71.bam
bowtie2 -p 50 -X 2000 -x NCBI_transcripts -1 71-B4_S4_R1_001.fastq.gz -2 71-B4_S4_R2_001.fastq.gz | samtools view -bS - > 72.bam
# sorting bam files
samtools sort 41.bam -@ 50 > 41.sorted.bam
samtools sort 42.bam -@ 50 > 42.sorted.bam
samtools sort 71.bam -@ 50 > 71.sorted.bam
samtools sort 72.bam -@ 50 > 72.sorted.bam
# index bam files
samtools index 41.sorted.bam -@ 15 && rm 41.bam
samtools index 42.sorted.bam -@ 15 && rm 42.bam
samtools index 71.sorted.bam -@ 15 && rm 71.bam
samtools index 72.sorted.bam -@ 15 && rm 72.bam
# Merge illumina bam files
samtools merge Illumina_merged.bam 41.sorted.bam 42.sorted.bam 71.sorted.bam 72.sorted.bam -@ 50 -f
samtools sort -o Illumina_merged.sorted.bam Illumina_merged.bam -@ 50
samtools index Illumina_merged.sorted.bam -@ 50
rm Illumina_merged.bam
2) Run ReSeq:
## Inputs:
- Reference transcriptome: NCBI_transcripts.fa
- Real Illumina mapped reads: Illumina_merged.sorted.bam
- Paired real Illumina RNA-seq reads
reseq illuminaPE -j 30 -c 20 -r NCBI_transcripts.fa -b Illumina_merged.sorted.bam -1 illumina_simulated_data_1.fq -2 illumina_simulated_data_2.fq
pigz -p 30 illumina_simulated_data_*.fq
Benchmarking on simulations were carried out as done with real datasets as above.