Skip to content

Anegin24/Shotgun-metagenomics

Repository files navigation

Shotgun metagenomics workflow (long-read (ONT) and short-read)

image

1. Setup

1.1. Install Miniforge

Download and install Miniforge (a minimal conda installer):

# Download the Miniforge installer. TLS certificate verification is left
# enabled (no --no-check-certificate): this script is executed on the next
# line, so it must come from the genuine GitHub host.
wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh
# Run the interactive installer.
bash Miniforge3-Linux-x86_64.sh
# Register conda's shell hook in the shell start-up file, then reload
# ~/.bashrc so the current session picks it up.
conda init
source ~/.bashrc

1.2. Install dependencies: the long_read_shotgun environment

# Create a new, empty environment named long_read_shotgun
mamba create --name long_read_shotgun -y

# Activate the environment
source activate long_read_shotgun

# Install packages from Bioconda
# sra-tools is installed on its own first; prefetch and fastq-dump, used in
# the "Download Dataset" section, are presumably provided by it.
mamba install -c bioconda sra-tools
# QC, trimming, mapping, assembly and taxonomic-classification tools.
# No -y is given on these two installs, so mamba will ask for confirmation.
mamba install -c bioconda fastp porechop bowtie2 samtools minimap2 fastqc nanoplot quast spades flye pandas kraken2 bracken

Set up the environment for binning.

# Create the "binning" environment with metawrap and a pinned set of
# companion packages on Python 2.7.15.
# NOTE(review): no channel (-c) is given, so every package is resolved from
# the channels already configured for this conda installation — presumably
# bioconda (and whichever channel ships metawrap) must be configured
# beforehand; confirm.
# NOTE(review): no -y is given, so mamba will ask for confirmation.
mamba create --name binning python=2.7.15 \
    metawrap \
    biopython=1.68 \
    blast=2.6.0 \
    bmtagger=3.101 \
    bowtie2=2.3.5 \
    bwa=0.7.17 \
    checkm-genome=1.0.12 \
    fastqc=0.11.8 \
    kraken=1.1 \
    kraken2=2.0 \
    krona=2.7 \
    matplotlib-base=2.2.3 \
    maxbin2=2.2.6 \
    metabat2=2.12.1 \
    pandas=0.24.2 \
    perl-bioperl \
    pplacer=1.1.alpha19 \
    prokka=1 \
    quast=5.0.2 \
    r-ggplot2=3.1.0 \
    r-reshape2 \
    r-recommended=3.5.1 \
    samtools=1.9 \
    seaborn=0.9.0 \
    spades=3.13.0 \
    trim-galore=0.5.0

Set up the environment for CheckM2

# Create and activate the conda environment
mamba create -n checkm2 python=3.8 -y
mamba activate checkm2

# Install CheckM2
mamba install -c bioconda checkm2 -y

# Create directory for CheckM2 database
# Note: the cd below leaves the shell inside checkm2_db for all later commands.
mkdir -p checkm2_db
cd checkm2_db

# Download the CheckM2 database into the current directory (checkm2_db)
checkm2 database --download --path .

# Set the CHECKM2DB environment variable
# "path/to/checkm2_db" is a placeholder: replace it with the real location of
# the database downloaded above. The export only lasts for the current shell
# session unless it is also added to a start-up file such as ~/.bashrc.
# NOTE(review): CheckM2 may expect the path of the downloaded database file
# itself rather than its directory — confirm against the CheckM2 docs.
export CHECKM2DB="path/to/checkm2_db"

2. Download Dataset

Generate SraAccList.txt

SRR18491298
SRR18491084
SRR18491050
SRR18490941
SRR18490946
SRR18491056
SRR18491085
SRR18490950
SRR18491259
SRR18490980
SRR18491247
SRR18490968
SRR18491039
SRR18491329
SRR18490989
SRR18491307
SRR18491312
SRR18490994
SRR18491000
SRR18491318
SRR18491323
SRR18491005
SRR18491040
SRR18491330
SRR18491337
SRR18491047

Generate sample-metadata.csv (comma-separated columns: Sample-id, name, type; shown below as a table)

Sample-id	name	type
SRR18490938	TD78	Illumina_shortread
SRR18490939	CD35	Illumina_shortread
SRR18490940	TD40	ONT_longread
SRR18490941	TD39	ONT_longread
SRR18490946	TD34	ONT_longread
SRR18490950	CD34	Illumina_shortread
SRR18490968	CD90	ONT_longread
SRR18490980	CD79	ONT_longread
SRR18490989	TD70	Illumina_shortread
SRR18490994	TD65	Illumina_shortread
SRR18491000	TD60	Illumina_shortread
SRR18491005	TD55	Illumina_shortread
SRR18491039	TD50	Illumina_shortread
SRR18491040	TD49	Illumina_shortread
SRR18491047	TD42	Illumina_shortread
SRR18491050	TD40	Illumina_shortread
SRR18491056	TD34	Illumina_shortread
SRR18491084	CD35	ONT_longread
SRR18491085	CD34	ONT_longread
SRR18491247	CD90	Illumina_shortread
SRR18491259	CD79	Illumina_shortread
SRR18491298	TD78	ONT_longread
SRR18491307	TD70	ONT_longread
SRR18491312	TD65	ONT_longread
SRR18491318	TD60	ONT_longread
SRR18491323	TD55	ONT_longread
SRR18491329	TD50	ONT_longread
SRR18491330	TD49	ONT_longread
SRR18491337	TD42	ONT_longread
SRR18491051	TD39	Illumina_shortread

Download raw data

prefetch --option-file SraAccList.txt

Move the downloaded data and generate FASTQ files

#!/bin/bash
# Sort the per-accession folders found in the current directory by sequencing
# platform and convert their contents to gzipped FASTQ.
#
# Input : sample-metadata.csv in the current directory, one sample per row
#         with the columns Sample-id, name, type. Columns may be separated by
#         commas or by tabs (a tab-separated row must not have empty fields).
#         One folder per sample, named after its Sample-id, is expected in the
#         current directory.
# Output: folders are moved to $inputdirectory/long or $inputdirectory/short;
#         FASTQ files are written to $inputdirectory/long/fastqlong (ONT) or
#         $inputdirectory/short/fastqshort (Illumina, one file per mate).
inputdirectory="/media/anegin97/DATA/DATA/Metagenomic/LongShortRead/"
# Create the target directories
mkdir -p "$inputdirectory/long/fastqlong" "$inputdirectory/short/fastqshort"

# Loop through each row of the metadata file. The "|| [[ -n ... ]]" part keeps
# the last row when the file does not end with a newline.
while IFS=$',\t' read -r sample_id name type || [[ -n $sample_id ]]; do
    # Drop the carriage return left by Windows (CRLF) line endings; otherwise
    # the type comparisons below would never match.
    type=${type%$'\r'}

    # Skip the header row
    if [[ $sample_id == "Sample-id" ]]; then
        continue
    fi

    # Pick the destination and the fastq-dump options from the read type;
    # rows with any other type are ignored.
    case $type in
        ONT_longread)
            target_dir="long"
            fastq_dir="$inputdirectory/long/fastqlong"
            dump_opts=(--gzip)
            ;;
        Illumina_shortread)
            target_dir="short"
            fastq_dir="$inputdirectory/short/fastqshort"
            # --split-files writes each mate of a pair to its own file.
            dump_opts=(--split-files --gzip)
            ;;
        *)
            continue
            ;;
    esac

    # Move the folder
    folder_path="./$sample_id"  # Assuming folders are named by Sample-id
    if [[ -d $folder_path ]]; then
        mv "$folder_path" "$inputdirectory/$target_dir/"
        echo "Moved folder $folder_path to $inputdirectory/$target_dir"
    else
        echo "Folder $folder_path not found"
    fi

    # Collect the files of the (possibly previously moved) sample folder. The
    # glob must stay OUTSIDE the quotes: a quoted "*" is never expanded and
    # fastq-dump would be handed a literal ".../*" path.
    sra_files=("$inputdirectory/$target_dir/$sample_id"/*)
    if [[ ! -e ${sra_files[0]} ]]; then
        echo "No files found in $inputdirectory/$target_dir/$sample_id; skipping fastq-dump" >&2
        continue
    fi

    # stdin is redirected so the tool can never consume the remaining rows of
    # the metadata file that feeds this loop.
    fastq-dump "${dump_opts[@]}" "${sra_files[@]}" -O "$fastq_dir" < /dev/null
done < sample-metadata.csv

3. Download Database

3.1. Human genome reference

Download the FASTA file

wget https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/latest/hg38.fa.gz
gunzip hg38.fa.gz

Indexing bowtie2

# Activate the environment
source activate long_read_shotgun

# Build the Bowtie2 index of the human reference inside host_index/.
# mkdir is used without -p, so it reports an error if host_index already exists.
mkdir host_index
cd host_index
# ../hg38.fa is the file produced by the gunzip step above, one level up.
# The build uses 20 threads and writes index files whose names start with
# "bt2_index_base".
bowtie2-build --threads 20 ../hg38.fa "bt2_index_base"

3.2. Taxonomic Classification

Download the KrakenTools helper scripts

wget https://raw.githubusercontent.com/jenniferlu717/KrakenTools/master/kreport2mpa.py
wget https://raw.githubusercontent.com/jenniferlu717/KrakenTools/master/combine_mpa.py

Download Kraken-Standard

# Download the prebuilt Kraken2 Standard database archive.
wget https://genome-idx.s3.amazonaws.com/kraken/k2_standard_20240904.tar.gz
# Unpack it into a dedicated k2_standard/ directory. -C selects the
# destination; a bare trailing "k2_standard" argument would instead be taken
# as the name of an archive member to extract.
mkdir -p k2_standard
tar -xzvf k2_standard_20240904.tar.gz -C k2_standard

4. Shotgun Metagenomics Workflow

4.1. Quality Control

For short read

bash QC_short.sh

For long read

bash QC_long.sh

4.2. Assembly

For short read

Solution 1

bash Assembly_short.sh

Solution 2: less RAM required

# Download the Miniconda installer over HTTPS (not plain HTTP), since the
# script is executed on the next line.
# A conda installation from step 1.1 (Miniforge) can be reused instead, in
# which case these two lines can be skipped.
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
# Create the QIIME 2 metagenome distribution (2024.10) environment from its
# published environment file.
conda env create -n qiime2-metagenome-2024.10 --file https://data.qiime2.org/distro/metagenome/qiime2-metagenome-2024.10-py310-linux-conda.yml
# Activate it so that the qiime commands below are on PATH.
conda activate qiime2-metagenome-2024.10

Import raw data to QIIME2

  # Build a QIIME 2 paired-end manifest (manifest.tsv) from the gzipped FASTQ
  # files in ./reads, then import them as reads.qza.
  # Forward files are the ones matching reads/*_1*gz. The sample id is the
  # file name up to its first "_", and the reverse file is the same path with
  # the first "_1" replaced by "_2".
  printf 'sample-id\tforward-absolute-filepath\treverse-absolute-filepath\n' > manifest.tsv
  for FOR in reads/*_1*gz; do
    # An unmatched glob stays as the literal pattern; skip it instead of
    # writing a bogus row.
    [[ -e "$FOR" ]] || continue
    ID=$(basename -- "$FOR" | cut -f1 -d_)
    REV=${FOR/_1/_2}
    # Without the mate file the import below would fail, so warn and leave
    # the sample out.
    if [[ ! -e "$REV" ]]; then
      printf 'warning: reverse read file %s not found; skipping sample %s\n' "$REV" "$ID" >&2
      continue
    fi
    printf '%s\t%s\t%s\n' "$ID" "${PWD}/${FOR}" "${PWD}/${REV}"
  done >> manifest.tsv
  qiime tools import \
  --type 'SampleData[PairedEndSequencesWithQuality]' \
  --input-path manifest.tsv \
  --output-path reads.qza \
  --input-format PairedEndFastqManifestPhred33V2

Assembly with megahit

# Assemble the imported reads with MEGAHIT (meta-sensitive preset, 16 threads).
# The input is reads.qza, the artifact written by the import step above into
# the current directory.
# The output directory is created first; presumably QIIME 2 does not create
# missing parent directories for its outputs.
mkdir -p contigs
qiime assembly assemble-megahit \
    --i-seqs reads.qza \
    --p-presets "meta-sensitive" \
    --p-num-cpu-threads 16 \
    --o-contigs contigs/contigs.qza \
    --verbose

QC with QUAST

# Evaluate the assembled contigs (QUAST-based QC, per the heading above) using
# 16 threads with the memory-efficient option enabled; the result is written
# as the visualization contigs/contigs.qzv.
qiime assembly evaluate-contigs \
    --i-contigs contigs/contigs.qza \
    --p-threads 16 \
    --p-memory-efficient \
    --o-visualization contigs/contigs.qzv \
    --verbose

Export contigs.qza to FASTA format (one file per sample)

# Export the contigs artifact written by the assembly step
# (contigs/contigs.qza) into the exported-contigs/ directory.
qiime tools export \
  --input-path contigs/contigs.qza \
  --output-path exported-contigs

For long read

bash Assembly_long.sh

5. Taxonomy classification using Kraken2

Classification with Kraken2

bash kraken2.sh

Bracken

bash bracken.sh

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published