From e47a7ae41d52e5981077858c948091fd74b4dc18 Mon Sep 17 00:00:00 2001 From: vinuesa Date: Tue, 24 Oct 2023 23:06:57 -0600 Subject: [PATCH] v1.0. major code rewrite --- compute_blastp_RBH_orthologous_clusters.sh | 503 +++++++++++---------- 1 file changed, 257 insertions(+), 246 deletions(-) diff --git a/compute_blastp_RBH_orthologous_clusters.sh b/compute_blastp_RBH_orthologous_clusters.sh index 43f3f4f..75d7d95 100755 --- a/compute_blastp_RBH_orthologous_clusters.sh +++ b/compute_blastp_RBH_orthologous_clusters.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -#: progname: compute_blastp_RBH_orthologous_clusters.sh +#: progname: compute_RBH_clusters.sh #: Author: Pablo Vinuesa, CCG-UNAM, @pvinmex, https://www.ccg.unam.mx/~vinuesa/ # #: AIM: Simple script around NCBI's blastp, to compute reciprocal best hits between a @@ -42,7 +42,10 @@ #---------------------------------------------------------------------------------------- progname=${0##*/} -vers=0.6_2023-08-26 # improved code modularization and comments. +vers=1.0_2023-10-24 # - Major rewrite and simplification of the RBH filtering code, now using AWK hashes. + # - Major rewrite and simplification of the code to write out clusters + # - Customized and more informative BLAST results table fields + # - higher customizability of BLAST runs min_bash_vers=4.4 # required to write modern bash idioms: # 1. printf '%(%F)T' '-1' in print_start_time; and @@ -52,13 +55,23 @@ min_bash_vers=4.4 # required to write modern bash idioms: set -o pipefail + +# fixed custom blastp header +cols='6 qseqid sseqid pident gaps length qlen slen qcovs evalue score' + # Initialize variables with default values DEBUG=0 -qcov=80 -num_aln=10 +qcov=70 +num_aln=5 threads=4 ext=faa +mat=BLOSUM62 +Eval=0.001 +seg=yes # yes|no +mask=true # true|false +fix_header=0 # do not fix FASTA header for makeblastdb + # Color codes for output RED='\033[0;31m' NC='\033[0m' # No Color => end color @@ -72,6 +85,22 @@ function error() { echo -e "${RED}Error: ${NC} $1" >&2 exit 1 } +#----------------------------------------------------------------------------------------- + +# Function to check that files were generated +function check_file() { + if [ -s "$1" ] + then + echo " wrote $1" + elif [ ! -s "$1" ] && [ -n "$2" ] # pass a second arg, like warn + then + echo -e "${RED}WARNING: could not write ${NC} $1" >&2 + else + echo -e "${RED}Error: could not write ${NC} $1" >&2 + exit 1 + fi +} +#----------------------------------------------------------------------------------------- # Function to check Bash version function check_bash_version() @@ -104,56 +133,9 @@ function print_start_date() } #----------------------------------------------------------------------------------------- -function filter_best_hits() -{ - # Processes blastp output to retain best hits for each query. - # sorts the hits for each query, to identify the one with the - # hightest bitscore (column 12 of standard tabular blastp -m 6), - # which is passed to filter_best_hits as its only parameter - - local infile n s - - # Declare arrays to store the names and best hits - declare -a names - declare -A max best - - infile=$1 - - # Read the input file and parse the fields - while read -r n f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 s; - do - if [[ -z "${max[$n]}" ]] - then - names+=( "$n" ) - fi - - # Compare the score for the current name to the current max value - # float comparisons using bc - if [[ ! "${max[$n]}" ]] || [[ 1 -eq "$(echo "$s > ${max[$n]}" | bc)" ]] - then - max["$n"]="$s" - unset 'best[$n]' - fi - - # If the current score is equal to the current max value, store the line - # float comparisons using bc - if [[ 1 -eq "$(echo "$s == ${max[$n]}" | bc)" ]] - then - best[$n]="${n}\t$f2\t$f3\t$f4\t$f5\t$f6\t$f7\t$f8\t$f9\t${f10}\t${f11}\t$s" - fi - done < "$infile" - - # Print the best hits for each name - for n in "${names[@]}"; do - [[ -z "${best[$n]}" ]] && continue - echo -e "${best[$n]}" - done -} -#----------------------------------------------------------------------------------------- - # Function to check dependencies function check_dependencies() { - local dependencies=("fas2tab.pl" "tab2fas.pl" "blastp" "makeblastdb") + local dependencies=("fas2tab.pl" "tab2fas.pl" "blastp" "makeblastdb" "blastdbcmd") for dep in "${dependencies[@]}"; do if ! command -v "$dep" &> /dev/null; then @@ -226,12 +208,26 @@ EOF } #----------------------------------------------------------------------------------------- +function run_blastp() +{ + local q db outfile + q=$1 + db=$2 + outfile=$3 + + blastp -query "$q" -db "$db" -matrix "$mat" -outfmt "$cols" -num_alignments $num_aln -num_threads $threads -qcov_hsp_perc "$qcov" \ + -seg "$seg" -soft_masking "$mask" -evalue "$Eval" > "$outfile" +} +#----------------------------------------------------------------------------------------- + + function print_help() { # Prints the help message explaining how to use the script. cat < [-e ] [-n ] [-q ] [-t ] [-D] [-v] + Usage: $progname -d [-e ] [-E ] [-m ] [-n ] [-q ] [-t ] + [-S SEG filtering] -M [ soft_masking] [-D] [-v] REQUIRED: -d path to directory containing the input proteomes (protein FASTA) @@ -239,9 +235,14 @@ function print_help() OPTIONAL -D print debugging info -e fasta file extension name [def:$ext] + -E E-value [def:$Eval] + -F <0|1> fix FASTA header for makeblastdb [def:$fix_header] -h print this help -n number of blast alignments [def:$num_aln] + -m matrix name [def:$mat] + -M soft_masking [def:$mask] -q minimum query coverage percentage cutoff [def:$qcov] + -S SEG filtering [def:$seg] -t number of threads for blastp runs [def:$threads] -v print version @@ -292,7 +293,7 @@ EOF args=("$@") -while getopts ':d:e:n:q:r:t:hDv?:' OPTIONS +while getopts ':d:e:E:F:m:M:n:q:r:S:t:hDv?:' OPTIONS do case $OPTIONS in @@ -300,14 +301,24 @@ do ;; e) ext=$OPTARG ;; + E) Eval=$OPTARG + ;; + F) fix_header=$OPTARG + ;; h) print_help ;; + m) mat=$OPTARG + ;; + M) mask=$OPTARG + ;; n) num_aln=$OPTARG ;; q) qcov=$OPTARG ;; r) ref=$OPTARG ;; + S) seg=$OPTARG + ;; t) threads=$OPTARG ;; v) print_version @@ -335,6 +346,13 @@ if [[ -z "$proteome_dir" ]]; then print_help fi +if [[ -z "$ref" ]] +then + ref_selection="auto" +else + ref_selection="$ref" +fi + ###>>> Exported variables !!! declare -x g=$genome_ID perl # export only2perl!!!; call as $ENV{genome_ID} @@ -358,8 +376,9 @@ $progname vers. $vers run on $hostn running $os with $nprocs processors and bash v.${bash_vers} at ${today/ /} using the following parameters: - proteome_dir=$proteome_dir | fasta_extension=$ext - - BLASTP params: blastp v.${blast_vers} | num_aln=$num_aln | qcov=$qcov | threads=$threads - - reference=$ref | DEBUG=$DEBUG + - BLASTP params: blastp v.${blast_vers} | num_aln=$num_aln | qcov=$qcov | threads=$threads | + Eval=$Eval | mat=$mat | seg=$seg | mask=$mask + - reference=$ref_selection | fix_header=$fix_header | DEBUG=$DEBUG - invocation: $progname ${args[*]} ===================================================================================================== " >&2 @@ -369,13 +388,13 @@ $progname vers. $vers # ------------------- # Main script logic starts here - # --------------------------------------- # 0. Validate user input & setup pipeline # --------------------------------------- # Check dependencies and Bash version check_dependencies -check_bash_version "$min_bash_vers" +v=$(check_bash_version "$min_bash_vers") +echo "# running under bash v.$v" # Move to proteome directory print_start_time && echo "# working in $proteome_dir" @@ -392,6 +411,7 @@ fi echo '-----------------------------------------------------------------------------------------------------' + # keep the ref as first element in fasta arrays declare -a non_ref infiles non_ref=( $(ls *"${ext}" | grep -v "$ref") ) @@ -401,254 +421,245 @@ infiles=("$ref" "${non_ref[@]}") # 1. edit headers for makeblastdb (blast+): Formatting and Indexing Proteomes # --------------------------------------------------------------------------- # The FASTA headers of all proteomes are edited using Perl to add a unique identifier. -print_start_time && echo "# formatting ${#infiles[@]} input FASTA files for indexing" -perl -pe 'if(/^>/){$c++; s/>/>lcl\|REF_$c /}' "$ref" > "${ref}ed" - -genome_ID=0 -g=0 -for f in "${non_ref[@]}"; do - genome_ID=$(( genome_ID + 1 )) - g="$genome_ID" - perl -pe 'if(/^>/){$c++; s/>/>lcl\|GENO$ENV{g}\_$c /}' "$f" > "${f}ed" -done +if ((fix_header == 1)) +then + print_start_time && echo "# formatting ${#infiles[@]} input FASTA files for indexing" + perl -pe 'if(/^>/){$c++; s/>/>lcl\|REF_$c /}' "$ref" > "${ref}ed" + + genome_ID=0 + g=0 + for f in "${non_ref[@]}"; do + genome_ID=$(( genome_ID + 1 )) + g="$genome_ID" + perl -pe 'if(/^>/){$c++; s/>/>lcl\|GENO$ENV{g}\_$c /}' "$f" > "${f}ed" + done +else + for f in *"${ext}" + do + ln -s "$f" "${f}ed" + done +fi # ------------------------------------------------------------------- # 2. run makeblastdb on all input proteomes with edited FASTA headers # ------------------------------------------------------------------- # Each proteome is used to create a blastp database using makeblastdb. print_start_time && echo "# Generating indexed blastp databases" + for f in *"${ext}"ed do - #Note: -parse_seqids results in query and subject IDs with different structure => lcl|A_DB_203 B_DB_166 + #Note: -parse_seqids results in query and subject IDs with different structure => lcl|A_DB_203 B_DB_166 makeblastdb -in "$f" -dbtype prot -parse_seqids &> /dev/null - - # The tabular output of the fas2tab.pl script is saved for each proteome. - fas2tab.pl "$f" | sed '/^$/d' > "${f}"tab done echo '-----------------------------------------------------------------------------------------------------' -# declar hashes holding [query]-target and [target]-query hits -declare -A AB_hits BA_hits - #----------------------------------- # 3. Run and process pairwise blastp #----------------------------------- -genome_ID=0 - -# declare hashes to hold non-redundant query and target hits -declare -A seen seen2 - # For each non-reference proteome, blastp is run against the reference proteome # (REFvsGENO) and vice versa (GENOvsREF). -for f in "${non_ref[@]}"; do - genome_ID=$(( genome_ID + 1 )) - print_start_time && echo "# running: blastp -seg yes -soft_masking true -query ${ref}ed -db ${f}ed -qcov_hsp_perc $qcov -outfmt 6 -num_alignments $num_aln -num_threads $threads > REFvsGENO${genome_ID}" - blastp -seg yes -soft_masking true -query "${ref}"ed -db "${f}"ed -qcov_hsp_perc "$qcov" -outfmt 6 -num_alignments "$num_aln" -num_threads "$threads" > REFvsGENO"${genome_ID}" - - print_start_time && echo "# running: blastp -seg yes -soft_masking true -query ${f}ed -db ${ref}ed -qcov_hsp_perc $qcov -outfmt 6 -num_alignments $num_aln -num_threads $threads > GENO${genome_ID}vsREF" - blastp -seg yes -soft_masking true -query "${f}"ed -db "${ref}"ed -qcov_hsp_perc "$qcov" -outfmt 6 -num_alignments "$num_aln" -num_threads "$threads" > GENO"${genome_ID}"vsREF - - - # 3.1 The best hits are extracted using the filter_best_hits function. - # Retrieve the highest-scoring hit out of the -num_alignments $num_aln hits - # Note: makeblastdb with -parse_seqids produces subject cols without the lcl| prfix: => lcl|A_DB_203 B_DB_166 - # and therefore we remove them to have subject and query IDs with the same structure, - # required for hash key comparisons later in the code - print_start_time && echo "# filter_best_hits REFvsGENO${genome_ID} > REFvsGENO${genome_ID}.best" - filter_best_hits REFvsGENO"${genome_ID}" | sed 's#lcl|##' > REFvsGENO"${genome_ID}".best - - print_start_time && echo "# filter_best_hits GENO${genome_ID}vsREF > GENO${genome_ID}vsREF.best" - filter_best_hits GENO"${genome_ID}"vsREF | sed 's#lcl|##' > GENO"${genome_ID}"vsREF.best +genome_ID=0 - # 3.2. Reciprocal best hits (RBHs) are determined by comparing the hits between the two blastp runs. - # Compute A vs B reciprocal best hits: - # the following hashes will hold query=>subject and subject=>query results - # that will be used to identify the reicprocal best hits (RBHs) - print_start_time && echo "# Computing REF vs GENO reciprocal best hits @ qcov=$qcov" +for f in "${non_ref[@]}"; do + genome_ID=$(( genome_ID + 1 )) - #REF_1 GENO1_92 100.000 234 0 0 5 238 1 234 1.18e-180 488 - #REF_2 GENO1_141 100.000 278 0 0 1 278 1 278 0.0 569 - while read -r q subj rest - do - AB_hits["$q"]="$subj" - done < REFvsGENO"${genome_ID}".best - + ref_vs_geno_blastout=${ref%.*}vs${f%.*}_best_hits.tmp + geno_vs_ref_blastout=${f%.*}vs${ref%.*}_best_hits.tmp - #GENO1_1 REF_52 99.441 179 1 0 10 188 1 179 1.65e-131 359 - #GENO1_2 REF_53 100.000 95 0 0 1 95 1 95 6.70e-68 191 - while read -r q subj rest - do - (( seen[$q]++ )) - if (( ${seen[$q]} == 1 )); then - BA_hits["$subj"]="$q" - fi - done < GENO"${genome_ID}"vsREF.best + print_start_time && echo "# running: blastp -seg yes -soft_masking true -query ${ref}ed -db ${f}ed -qcov_hsp_perc $qcov -outfmt $cols -num_alignments $num_aln -num_threads $threads -evalue $Eval -matrix $mat > $ref_vs_geno_blastout" - # 3.3 find the RBHs by comparing the two hashes and sort output by A_DB_# key - RBH_outfile="${ref%.*}"_vs_"${f%.*}"_qcov"${qcov}"_RBHs_2col.tsv - for kA in "${!AB_hits[@]}" - do - (( seen2[${AB_hits[$kA]}]++ )) - if [[ "${BA_hits[$kA]}" == "${AB_hits[$kA]}" ]] && (( "${seen2[${AB_hits[$kA]}]}" == 1 )) - then - printf "%s\t%s\n" "$kA" "${AB_hits[$kA]}" - else - continue - fi - done | sort -k1.6g > "$RBH_outfile" + blastp -seg yes -soft_masking true -query "${ref}"ed -db "${f}"ed -qcov_hsp_perc "$qcov" -outfmt "$cols" -num_alignments "$num_aln" \ + -num_threads "$threads" -evalue "$Eval" -matrix "$mat" > "$ref_vs_geno_blastout" + + check_file "$ref_vs_geno_blastout" + # Retrieve the best nonREF proteome database hits using blastdbcmd, onlfy if qcov > \$qcov + print_start_time && echo "# retrieving the best hits from $ref_vs_geno_blastout with blastdbcmd ..." + blastdbcmd -entry_batch <(awk -F"\t" -v qcov=$qcov '$8 > qcov{print $2}' "$ref_vs_geno_blastout" | sort -u) -db "${f}"ed > "${ref%.*}vs${f%.*}"_besthits.faa + + check_file "${ref%.*}vs${f%.*}"_besthits.faa + + num_hits=$(grep -c '^>' "${ref%.*}vs${f%.*}"_besthits.faa) + + if ((num_hits == 0)) + then + echo "WARNING: no hits in ${ref%.*}vs${f%.*}"_besthits.faa" + rm ${ref%.*}vs${f%.*}"_besthits.faa + continue + fi - # 3.4 make a more informative RBH_outfile name, including number of RBHs found - num_RBHs=$(wc -l "$RBH_outfile" | awk '{print $1}') - RBH_out="${ref%.*}"_vs_"${f%.*}"_qcov"${qcov}"_${num_RBHs}RBHs_2col.tsv - mv "$RBH_outfile" "$RBH_out" - RBH_outfile="$RBH_out" - print_start_time && echo "# Found $num_RBHs reciprocal best hits between $ref and $f at qcov=${qcov}%" - echo '===' + print_start_time && echo "# running: blastp -seg yes -soft_masking true -query ${ref%.*}vs${f%.*}_besthits.faa -db ${ref}ed -qcov_hsp_perc $qcov -outfmt $cols -num_alignments $num_aln -num_threads $threads -evalue $Eval -matrix $mat > $geno_vs_ref_blastout" + + blastp -seg yes -soft_masking true -query "${ref%.*}vs${f%.*}"_besthits.faa -db "${ref}"ed -qcov_hsp_perc "$qcov" -outfmt "$cols" -num_alignments "$num_aln" \ + -num_threads "$threads" -evalue "$Eval" -matrix "$mat"> "$geno_vs_ref_blastout" + + check_file "$geno_vs_ref_blastout" + + # Sort the blastp output table from the preceding search by increasing E-values (in column 9) and decreasing scores (col 10) + # & filter out unique REF vs nonREF RBHs using AWK hashes from the sorted blast output table with qcov > $qcov + print_start_time && echo "# filtering out unique REF vs nonREF RBHs from the sorted blast output table with qcov > $qcov" + for GENOid in $(cut -f1 "$geno_vs_ref_blastout" | sort -u) + do + grep "$GENOid" "$ref_vs_geno_blastout" + done | sort -gk9,9 -gk10,10 | \ + awk -v qcov=$qcov 'BEGIN{FS=OFS="\t"}{REFid[$1]++; GENOid[$2]++; if(REFid[$1] == 1 && GENOid[$2] == 1 && $8 > qvov) print }' > \ + "${ref_vs_geno_blastout%.*}"_RBHs_qcov_gt"${qcov}".tsv + + check_file "${ref_vs_geno_blastout%.*}"_RBHs_qcov_gt"${qcov}".tsv ]] done echo '-----------------------------------------------------------------------------------------------' -#-------------------------------------------------------------------- -# 4. Loop over tsv files and count the number of hits for each REF_ID -#-------------------------------------------------------------------- -# Cluster Computation and Output -print_start_time "# Computing clusters of homologous sequences" +#----------------------------------------------------------------------- +# 4. Identify REF proteins shared by all tsv files holding pairwise RBHs +#----------------------------------------------------------------------- +# Find the intersections of REFs in all tsv files +print_start_time && echo "# computing the intersections of REF proteins in all tsv files holding pairwise RBHs" +awk '{r[$1]++; if(r[$1] == ARGC-1) print $1}' ./*.tsv > REF_RBH_IDs.list +[[ ! -s REF_RBH_IDs.list ]] && error "could not write REF_RBH_IDs.list" -# A hash (core_ref) counts the occurrences of each reference ID in the RBH tables. -declare -A core_ref # hash counting the instances of the REFERNCE_IDs in the RBH tables +intersection_size=$(wc -l REF_RBH_IDs.list | awk '{print $1}') +((intersection_size > 0)) && echo "# found $intersection_size core RBHs shared by ${#non_ref[*]} nonREF proteomes with the $ref reference proteome" +((intersection_size == 0)) && error "# ERROR: found $intersection_size core orhtologous genes among ${#infiles[*]} input proteomes ..." -# For each RBH table, the RBH clusters are extracted and stored in cluster_X.fastab files. -for f in *RBHs_2col.tsv; do - while read -r REF QUERY - do - # count the instances of REFERNCE_IDs in each RBHs_2col.tsv tables - (( core_ref["$REF"]++ )) - done < "$f" -done - -((DEBUG > 0 )) && echo "# DEBUG: Loop over tsv files and count the number of hits for each REF_ID; \${#non_ref[@]}: ${#non_ref[@]}" >&2 \ - && for k in "${!core_ref[@]}"; do (("${core_ref[$k]}" == "${#non_ref[@]}")) && echo -e "$k\t${core_ref[$k]}"; done >&2 - -# A list of reference IDs shared with all genomes is created (ref_orth_IDs). -declare -a ref_orth_IDs # array holding the REFERNCE_IDs found in all RBH tables +#------------------------------------------------------------------------------------------------------------------------- +# 5. Loop over tsv files and generate RBHs_matrix.tsv core_genome_clusters.tsv and nonCore_genome_clusters.tsv tables +#------------------------------------------------------------------------------------------------------------------------- +# Cluster Computation -# ref_orth_IDs contains the reference_IDs of orthologous loci -# shared by the REFERENCE with all genomes; -ref_orth_IDs=( $(for k in "${!core_ref[@]}"; do (("${core_ref[$k]}" == "${#non_ref[@]}")) && printf '%s\n' "$k"; done) ) +print_start_time && echo "# Computing clusters of homologous sequences" -# sort the ref_orth_IDs array; note sort -k1.5g for IDs like REF_58 -ref_orth_IDs=( $(printf '%s\n' "${ref_orth_IDs[@]}" | sort -k1.5g) ) -((DEBUG > 0 )) && echo "# DEBUG: the sorted \${ref_orth_IDs[@]} array:" >&2 \ - && echo "${ref_orth_IDs[@]}" >&2 +# The core_ref hash counts the instances of the REFERNCE_IDs in the RBH tables +declare -A core_ref +# The pangenome_clusters hash is indexed by REFERNCE_IDs +# and as its value holds the RBHs as a tab-separated +# string of IDs from nonREF proteomes +declare -A pangenome_clusters -echo "# found ${#ref_orth_IDs[@]} RBH clusters for ${#non_ref[@]} genomes against $ref" -printf '%s\n' "${ref_orth_IDs[@]}" > "${ref%.*}"_core_IDs.list - -for f in *RBHs_2col.tsv; do - base="${f%RBHs_2col.tsv}" - for id in "${ref_orth_IDs[@]}" +# Construct the pangenome_clusters hash, indexed by reference proteome, +# containing a value a string of tab-separated nonREF proteome RBH IDs. +print_start_time && echo "# populating the pangenome_clusters hash ..." +for t in *RBHs_*.tsv; do + while read -r REF QUERY rest do - grep -w "$id" "$f" | cut -f2 - done > "${base}"_core_IDs.list + # count the instances of REFERNCE_IDs in each RBHs_*.tsv tables + (( core_ref["$REF"]++ )) + if [[ ${core_ref["$REF"]} -eq 1 ]] + then + pangenome_clusters["$REF"]="$QUERY" + else + pangenome_clusters["$REF"]="${pangenome_clusters[$REF]}\t$QUERY" + fi + done < "$t" done -print_start_time && echo "# Generating a table of ortholog IDs shared with $ref" -paste ./*core_IDs.list > ORTHOLOG_IDs_SHARED_WITH_REF.tsv -# ORTHOLOG_IDs_SHARED_WITH_REF.tsv -#REF_1 GENO1_496 GENO2_843 -#REF_3 GENO1_498 GENO2_845 -#REF_4 GENO1_500 GENO2_847 - -#-------------------------- -# 5. Write cluster fastas -#-------------------------- -# The clusters are split into "core" and "non-core" based on the number of genomes sharing the locus. -print_start_time && echo '# Generating FASTA files for orthologous clusters' +# 5.1 print the RBHs_matrix.tsv +print_start_time && echo "# printing the pangenome_matrix, core_genome_clusters and nonCore_genome_clusters files" +for key in "${!pangenome_clusters[@]}" +do + echo -e "${key}\t${pangenome_clusters[$key]}" +done > RBHs_matrix.tsv +check_file RBHs_matrix.tsv -declare -a non_ref_faaedtab_files faaedtab_files -ref_faaedtab=$(ls "${ref}"edtab) -non_ref_faaedtab_files=( $(ls *."${ext}"edtab | grep -v "${ref}"edtab) ) -# put the ref_faaedtab in the first position idx[0] of the faaedtab_files array -faaedtab_files=( "$ref_faaedtab" "${non_ref_faaedtab_files[@]}" ) +# 5.2 print the core_genome_clusters.tsv +awk -v ninfiles="${#infiles[@]}" 'NF == ninfiles' RBHs_matrix.tsv > core_genome_clusters.tsv +check_file core_genome_clusters.tsv +# 5.3 print the nonCore_genome_clusters.tsv +awk -v ninfiles="${#infiles[@]}" 'NF != ninfiles' RBHs_matrix.tsv > nonCore_genome_clusters.tsv +check_file nonCore_genome_clusters.tsv warn -# 5.1 reconstitute cluster fastas -# NOTE: calling blasdbcmd multiple times in a loop to retrieve -# the cluster sequences is quite slow. -# Therefore they're grepped out of the *edtab files -declare -A seen3 -c=0 -while read -r -a ids; do - c=$((c + 1)) - # grep out the reference sequence, stored at idx[0] - grep -w "${ids[0]}" "${faaedtab_files[0]}" > cluster_"${c}".fastab - - # next process the non ref sequences stored at idxs >= 1 - # and append them to the growing cluster_"${c}".fastab file - for (( idx=1; idx <= ((${#ids[@]} -1)); idx++)); do - (( seen3[${ids[$idx]}]++ )) - ((DEBUG > 0)) && echo "DEBUG: idx:$idx; c=$c; grep -w ${ids[$idx]} ${faaedtab_files[$idx]}" >&2 - if (( ${seen3[${ids[$idx]}]} == 1 )); then - grep -w "${ids[$idx]}" "${faaedtab_files[$idx]}" >> cluster_"${c}".fastab - fi - done -done < ORTHOLOG_IDs_SHARED_WITH_REF.tsv +#----------------------------- +# 6. Write cluster FASTA files +#----------------------------- +# 6.1 read all source FASTA file into memory (as the hash seqs) for later filtering +print_start_time && echo "# reading all source FASTA files into memory (as the hash seqs) ..." -((DEBUG == 0)) && [[ -s ORTHOLOG_IDs_SHARED_WITH_REF.tsv ]] && rm ./*core_IDs.list -[[ ! -s ORTHOLOG_IDs_SHARED_WITH_REF.tsv ]] && error "could not write ORTHOLOG_IDs_SHARED_WITH_REF.tsv" +declare -A seqs; -# Convert fastab files back to FASTA -for f in cluster_*.fastab +for f in "${infiles[@]}" do - tab2fas.pl "$f" > "${f%tab}" + while read -r l + do + if [[ "$l" =~ ^\> ]] # line contains the FASTA header + then + key=$l + else # concatenate sequence lines + seqs["$key"]="${seqs[$key]}""${l}" + fi + done < "$f" done -# Cluster (RBH) FASTA files are moved to appropriate directories (core and non_core). -clusters_dir=${ref%.*}_vs_${#non_ref_faaedtab_files[@]}genomes_qcov"${qcov}"_"${#ref_orth_IDs[@]}"_clusters -[[ -d "$clusters_dir" ]] && rm -rf "$clusters_dir" -[[ ! -d "$clusters_dir" ]] && mkdir "$clusters_dir" || error "could not generate dir $clusters_dir" +# 6.2 filter the hash seqs with cluster keys saved in each line of RBHs_matrix.tsv +# for k in "${!seqs[@]}"; do if [[ $k =~ \>Q8MYF2 ]]; then echo -e "$k\n${seqs[$k]}"; fi; done < test.faa +print_start_time && echo "# Filtering the seqs hash and writing RBH cluster FASTA files ..." -print_start_time && echo "# moving cluster FASTA files and tables to $clusters_dir" -mv cluster*.fas GENO*.best REF*.best ORTHOLOG_IDs_SHARED_WITH_REF.tsv "$clusters_dir" +#initialize cluster counter +c=0 -cd "$clusters_dir" || error "could not cd into $clusters_dir" -mkdir core || error "could not mkdir core" -mkdir non_core || error "could not mkdir non_core" +# read each line of the RBHs_matrix.tsv +# into the ids array using the read -r -a idiom, +# and filter the seqs hash with the corresponding IDs as keys +# to write out the cluster_x.fas files +# NOTE: this is very slow, even with the continue 2 statement; should benchmark filtering fastab with grep +while read -r -a ids +do + ((c++)) + # iterate over all indexes of the idx array + # and append them to the growing cluster_"${c}".fastab file + for (( idx=0; idx <= ((${#ids[@]} -1)); idx++)); do + # iterate of all source sequence FASTA headers (k) + # and print out only sequences matching the ids + # in cluster c, concatenating them '>>' to cluster_"${c}".fastab + for k in "${!seqs[@]}" + do + if [[ "$k" =~ "${ids[$idx]}" ]] + then + ((DEBUG > 0)) && echo "DEBUG (write clusters): k:$k; idx:$idx; ids[$idx]}:${ids[$idx]}; c=$c" >&2 + echo -e "$k\n${seqs[$k]}" + continue 2 + fi + done >> cluster_"${c}".fas + done +done < RBHs_matrix.tsv -core_loci=0 -non_core_loci=0 -for f in *fas; do - if grep -c '^>' "$f" | grep "${#infiles[@]}" &> /dev/null; then - core_loci=$((core_loci + 1)) - mv "$f" core - else - non_core_loci=$((non_core_loci + 1)) - mv "$f" non_core - fi + +# 6.3 filter out core and nonCore clusters +mkdir core_clusters || error "could not mkdir core_clusters" +mkdir nonCore_clusters || error "could not mkdir nonCore_clusters" +for f in cluster_*.fas +do + if [[ $(grep -c '^>' "$f") -eq "${#infiles[@]}" ]] + then + mv "$f" core_clusters/core_"${f}" || error "could not mv $f to core_clusters/core_${f}" + else + mv "$f" nonCore_clusters/nonCore_"${f}" || error "could mv $f to nonCore_clusters/nonCore_${f}" + fi done -print_start_time && echo "# Moved $core_loci core FASTA files to $clusters_dir/core" -print_start_time && echo "# Moved $non_core_loci non-core FASTA files to $clusters_dir/non_core" -# ---------------- -# 6. final cleanup -# ---------------- -# Unnecessary files generated during the process are removed. -cd "$wkdir" || error "could not cd into $wkdir" +#---------------- +# 7. final cleanup +#---------------- +#Unnecessary files generated during the process are removed. print_start_time && echo "# final cleanup in $wkdir" -((DEBUG == 0)) && rm ./*fastab GENO*vsREF REFvsGENO[[:digit:]]* ./*"${ext}"ed* ./*RBHs_2col.tsv -echo +((DEBUG == 0)) && rm *faaed *faaed.* *best_hits.tmp REF_RBH_IDs.list *besthits.faa elapsed=$(( SECONDS - start_time )) eval "echo Elapsed time: $(date -ud "@$elapsed" +'$((%s/3600/24)) days, %H hr, %M min, %S sec')" print_end_message + + +exit 0 + + +