_run_gbs_qc

#!/bin/sh
#
# Master interactive script for processing a NovaSeq run through bclconvert, database imports and GBS analysis.
#

# locations of the pipeline code - exported so that child processes inherit them
GBS_PRISM_BIN=/dataset/gseq_processing/active/bin/gbs_prism
SEQ_PRISMS_BIN=$GBS_PRISM_BIN/seq_prisms
export SEQ_PRISMS_BIN GBS_PRISM_BIN

# host that run_bclconvert must be run from ( iramohio-01 )
BCLCONVERT_NODE=invbfopp10.agresearch.co.nz

# folder that dump_gbs_tables writes its database table dumps to
GBS_BACKUP_DIR=/dataset/gseq_processing/archive/backups

function send_mail() {
   # Send a notification mail via mutt.
   #   $1 : text used as the subject line ; the body is a single empty line.
   # The subject is also left in the global "message".
   message="$1"

   # kept word-for-word as the argument list handed to mutt
   local -a recipients=(vanstijnt , mccullocha, bairdh, perrybe, andersonr, andrewsa, henryh, frenchm, hicklandm)

   echo "sending mail"
   mutt -s "$message" "${recipients[@]}" <<< ""
}


function read_answer_with_default() {
   # Obtain an answer to a prompt, leaving it in the global "answer".
   #   $@ : the default answer (all arguments, joined with spaces)
   # If INTERACTIVE is "yes" a line is read from stdin and the default is used only when
   # that line is empty ; otherwise the default is used without reading anything.
   # What was chosen is echoed to stdout.
   local default_answer
   default_answer=$*

   # quoted so that an unset or empty INTERACTIVE simply means "not interactive"
   if [ "$INTERACTIVE" == yes ]; then
      # -r : keep any backslashes the user typed (e.g. in a path) as they are
      read -r answer
      echo "User response: $answer"
      if [ -z "$answer" ]; then
         answer=$default_answer
         echo "Default response: $answer"
      fi
   else
      answer=$default_answer
      echo "Default response: $answer"
   fi
}

function more_with_default() {
   # Show a file : paged with "more" when INTERACTIVE is "yes", otherwise dumped with "cat".
   #   $1 : path of the file to show (if omitted, stdin is shown)
   # Arguments are passed on quoted so that a path containing spaces is treated as one file.
   if [ "$INTERACTIVE" == yes ]; then
      more "$@"
   else
      cat "$@"
   fi
}

function get_opts() {
   # Parse the command line.
   #   options    : -h (help, then exit 0) , -i (interactive) , -r RUN (run name)
   #   positional : gbs_version kgd_version
   # Sets the globals INTERACTIVE (yes/no), INTERACTIVE_OPT ("-i" or empty), ARGRUN,
   # gbs_version and kgd_version. An unknown option, or -r without a value, exits with status 1.
   INTERACTIVE=no
   INTERACTIVE_OPT=""
   ARGRUN=""
   help_text="
This script is called by run_gbs_qc (or non-interactively by a cron job)
usage :
   [-h] [-i] [-r RUN] gbs_version kgd_version

   -h      print this help and exit
   -i      interactive - prompt for each answer (otherwise the default answer to each prompt is used)
   -r RUN  name of the run, used as the default answer when prompting for the run
"
   while getopts ":hir:" opt; do
      case $opt in
         h)
            # quoted, so that the line breaks in the help text survive
            echo "$help_text"
            exit 0
            ;;
         i)
            INTERACTIVE=yes
            INTERACTIVE_OPT="-i"
            ;;
         r)
            ARGRUN=$OPTARG
            ;;
         \?)
            echo "Invalid option: -$OPTARG" >&2
            exit 1
            ;;
         :)
            echo "Option -$OPTARG requires an argument." >&2
            exit 1
            ;;
      esac
   done

   # drop the options that were parsed, leaving just the positional arguments
   shift $((OPTIND-1))

   gbs_version=$1
   kgd_version=$2
}


function run_bclconvert() {
   # Run bclconvert for $RUN (via sequencing_qc_prism.sh -a bclconvert). Output and
   # bclconvert.log are written under $BCLCONVERT_ROOT/$RUN/SampleSheet.
   #
   # Globals read : HOSTNAME BCLCONVERT_NODE NOVASEQ_ROOT BCLCONVERT_ROOT RUN SAMPLE_SHEET
   #                GBS_PRISM_BIN SEQ_PRISMS_BIN INTERACTIVE
   # Globals set  : bclconvert_phrase samplesheet_to_fastqnames_phrase (and answer, via the prompts)
   # Exits (status 1) if not on the bclconvert node, if the output folder cannot be created or
   # already contains a bclconvert folder, or whenever a prompt is not answered with exactly "y".
   local raw_run_folder="$NOVASEQ_ROOT/$RUN"
   local out_folder="$BCLCONVERT_ROOT/$RUN/SampleSheet"
   local headed_sample_sheet="$BCLCONVERT_ROOT/$RUN/SampleSheet.csv"

   if [ "$HOSTNAME" != "$BCLCONVERT_NODE" ]; then
      echo "sorry you need to be logged on to iramohio-01 to run bclconvert"
      exit 1
   fi

   echo "checking run is completed (i.e. looking for $raw_run_folder/RTAComplete.txt)"
   if [ ! -f "$raw_run_folder/RTAComplete.txt" ]; then
      echo "*** landmark file $raw_run_folder/RTAComplete.txt does not exist => this run has not completed sequencing (or uploading?) - are you SURE you want to continue !? (y/n default n)"
      read_answer_with_default n
      # the answer is quoted in every test below : unquoted, a multi-word answer makes the
      # test itself fail, which was then treated the same as "y"
      if [ "$answer" != "y" ]; then
         echo "OK quitting"
         exit 1
      else
         echo "OK will continue but note that output may be incomplete"
      fi
   fi

   bclconvert_phrase="" # not currently used - previously used to pass in bcl2fastq options 
   samplesheet_to_fastqnames_phrase="-I 1,2 -t single_end"

   # set up for bclconvert run
   mkdir -p "$out_folder"
   if [ ! -d "$out_folder" ]; then
      echo "could not create bclconvert output folder $out_folder , quitting"
      exit 1
   fi

   ####### ensure sample sheet used for bclconvert contains header, and includes specification of adapter
   # (the same command is used whatever the platform)
   cat "$SAMPLE_SHEET" | "$GBS_PRISM_BIN/add_sample_sheet_header.py" -H "$GBS_PRISM_BIN/etc/sample_sheet_header.csv" > "$headed_sample_sheet"

   ###### ensure output folder does not exist
   if [ -d "$out_folder/bclconvert" ]; then
      echo "
*** error *** 
$out_folder/bclconvert already exists - please clean up and retry"
      exit 1
   fi

   # the command as shown to the user and recorded at the top of the log
   local command_text="$SEQ_PRISMS_BIN/sequencing_qc_prism.sh -a bclconvert -I $raw_run_folder -B \"$bclconvert_phrase\" -Q \"$samplesheet_to_fastqnames_phrase\" -O $out_folder $headed_sample_sheet"

   echo "about to run bclconvert using

   $command_text  > $out_folder/bclconvert.log  2>&1

   OK ? (y/n, default=y)
   "
   read_answer_with_default  y
   if [ "$answer" != "y" ]; then
      echo "OK quitting"
      exit 1
   fi
   echo "
   starting bclconvert, this should take around an hour or two...

   "

   # start a fresh log with the command line, then append the output of the run itself
   echo "running $command_text " > "$out_folder/bclconvert.log"

   if ! "$SEQ_PRISMS_BIN/sequencing_qc_prism.sh" -a bclconvert -I "$raw_run_folder" -B "$bclconvert_phrase" -Q "$samplesheet_to_fastqnames_phrase" -O "$out_folder" "$headed_sample_sheet" >> "$out_folder/bclconvert.log" 2>&1 ; then

      # nobody is watching a non-interactive run, so report the failure by mail
      if [ "$INTERACTIVE" != yes ]; then
         send_mail "sorry bclconvert for $RUN exited with an error code"
      fi

      echo "

      bclconvert has finished but received a non zero process exit code from sequencing_qc_prism - do
      you want to continue ? (y/n, default = n)
      "
      read_answer_with_default n 
      if [ "$answer" != "y" ]; then
         echo "OK quitting"
         exit 1
      fi
   fi
   

   if [ "$INTERACTIVE" != yes ]; then
      send_mail "(bclconvert for $RUN completed ok - fastq data is now available)"
   fi

   echo "

   bclconvert completed ok 

   "
}


function run_lane_qc() {
   # Start fastqc and kmer_analysis in the background on the fastq files of $RUN.
   #
   # The fastq files (over 1000 bytes, names not containing "undetermined") found under
   # $BCLCONVERT_ROOT/$RUN/SampleSheet/bclconvert are listed in sequence_files.txt ; the commands
   # are written to lane_qc.src in the same folder, shown to the user, and then sourced.
   # Exits (status 1) if no fastq files are found or the prompt is not answered with exactly "y".
   local out_folder="$BCLCONVERT_ROOT/$RUN/SampleSheet"

   echo "

   finding sequence files for fastqc..."
   mkdir -p "$out_folder"

   # the status tested is that of grep, which is non-zero when it let no lines through
   if ! find "$out_folder/bclconvert" -name "*.fastq.gz" -size +1000c -print | grep -vi Undetermined > "$out_folder/sequence_files.txt" ; then
      echo "run_lane_qc: oops could not find any fastq files under $out_folder/bclconvert ! giving up"
      exit 1
   fi  

   # lane_qc.src is sourced into this shell below, so it exports the values currently in
   # use (as restart_qc.src does) rather than hard-coded paths
   echo "
   export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
   export GBS_PRISM_BIN=$GBS_PRISM_BIN

   mkdir -p $out_folder/fastqc_run
   nohup $SEQ_PRISMS_BIN/sequencing_qc_prism.sh -a fastqc -O $out_folder/fastqc_run \`cat $out_folder/sequence_files.txt\` > $out_folder/fastqc_run/fastqc.log 2>&1  &
   mkdir -p $out_folder/kmer_run
   nohup $SEQ_PRISMS_BIN/sequencing_qc_prism.sh -a kmer_analysis -s .0002 -M 10000 -O $out_folder/kmer_run \`cat $out_folder/sequence_files.txt\` > $out_folder/kmer_run/kmer_analysis.log 2>&1  &
   " > "$out_folder/lane_qc.src"
   echo "

   about to start fastqc and kmer_analysis in the background using 
"
   cat "$out_folder/lane_qc.src"
   echo "
OK ? (y/n, default=y)
   "
   read_answer_with_default y
   # quoted : unquoted, a multi-word answer broke the test and was then treated as "y"
   if [ "$answer" != "y" ]; then
      echo "OK quitting"
      exit 1
   fi
   echo "

   running fastqc and kmer analysis in background, then will continue with GBS q/c...

   "
   source "$out_folder/lane_qc.src"
}

function run_dedupe() {
   # Run the dedupe step (sequencing_qc_prism.sh -a dedupe) over the fastq files of $RUN.
   #
   # The files under $BCLCONVERT_ROOT/$RUN/SampleSheet/bclconvert (names not containing
   # "undetermined") are listed in files_to_dedupe.txt ; output goes to dedupe.log, and a
   # restart_dedupe.src that repeats the command is left in the same folder.
   # Exits (status 1) if the go-ahead prompt is not answered with exactly "y", or if dedupe
   # fails and the user does not answer "y" to continuing.
   local out_folder="$BCLCONVERT_ROOT/$RUN/SampleSheet"

   ls "$out_folder"/bclconvert/*.fastq.gz | grep -vi undetermined > "$out_folder/files_to_dedupe.txt"

   # the command as shown to the user and as written to the restart script
   local command_text="$SEQ_PRISMS_BIN/sequencing_qc_prism.sh -a dedupe -D \"dedupe optical dupedist=15000 subs=0\" -T /dataset/gseq_processing/itmp/illumina -O $out_folder \`cat $out_folder/files_to_dedupe.txt\`  > $out_folder/dedupe.log 2>&1"

   echo "
   About to start dedupe using : 

   $command_text

OK ? (y/n, default=y)
   "
   read_answer_with_default y
   # quoted : unquoted, a multi-word answer broke the test and was then treated as "y"
   if [ "$answer" != "y" ]; then
      echo "OK quitting"
      exit 1
   fi
   echo "

   running dedupe

   "
   # the restart script exports the values currently in use (as restart_qc.src does)
   echo "
   export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
   export GBS_PRISM_BIN=$GBS_PRISM_BIN

   $command_text
   " > "$out_folder/restart_dedupe.src"

   echo "

   running dedupe (to restart or rerun this step , you can 
  
   source $out_folder/restart_dedupe.src

   -this should take about an hour 
   "
   # the list of files is deliberately left unquoted so that it splits into one argument per file
   if ! "$SEQ_PRISMS_BIN/sequencing_qc_prism.sh" -a dedupe -D "dedupe optical dupedist=15000 subs=0" -T /dataset/gseq_processing/itmp/illumina -O "$out_folder" $(cat "$out_folder/files_to_dedupe.txt") > "$out_folder/dedupe.log" 2>&1 ; then

      # nobody is watching a non-interactive run, so report the failure by mail
      if [ "$INTERACTIVE" != yes ]; then
         send_mail "sorry dedupe for $RUN exited with an error code"
      fi

      echo "

      dedupe has finished but received a non zero process exit code from sequencing_qc_prism - do
      you want to continue ? (y/n, default = n)
      "
      read_answer_with_default n
      if [ "$answer" != "y" ]; then
         echo "OK quitting"
         exit 1
      fi
   fi

}


function get_run_opts() {
   # Work out what is to be processed, running any preliminary steps that are still needed.
   # Every question is asked through read_answer_with_default, so when INTERACTIVE is not "yes"
   # the default answer to each is used.
   #
   # In order : choose the run ; locate its sample sheet and compare it with the design-time
   # copy ; offer bclconvert, lane q/c and dedupe if their outputs are missing ; choose the
   # output folder ; import the run into the database if gquery cannot find it ; choose the
   # analysis ; choose between the cluster and a local run.
   #
   # Globals read : gbs_version kgd_version ARGRUN INTERACTIVE
   # Globals set  : DRY_RUN DEBUG HPC_TYPE FILES OUT_ROOT SNP_ENGINE NOVASEQ_ROOT
   #                NOVASEQ_PROCESSING_ROOT NOVASEQ_BCLCONVERT_ROOT RUN platform gbs_ROOT
   #                PROCESSING_ROOT BCLCONVERT_ROOT SAMPLE_SHEET base DESIGN_TIME_PATH NEW_ROOT
   #                OUTPUT_ROOT PARAMETERS_FILE ANALYSIS
   # Exits (status 1) when the user declines to continue or the database import fails, and -
   # when not interactive - if the run or the output folder cannot be found/created (there is
   # nobody to re-prompt, so looping would never end).

   DRY_RUN=no
   DEBUG=no
   HPC_TYPE=slurm
   FILES=""
   OUT_ROOT=""
   SNP_ENGINE=tassel        # the only one supported at this point

   NOVASEQ_ROOT=/dataset/2024_illumina_sequencing_e/active
   NOVASEQ_PROCESSING_ROOT=/dataset/2024_illumina_sequencing_e/scratch/postprocessing/gbs
   NOVASEQ_BCLCONVERT_ROOT=/dataset/2024_illumina_sequencing_e/scratch/postprocessing/illumina/novaseq
   mkdir -p "$NOVASEQ_PROCESSING_ROOT"
   mkdir -p "$NOVASEQ_BCLCONVERT_ROOT"


   echo "*** gbs analysis version $gbs_version (KGD version $kgd_version) (using SNP engine = $SNP_ENGINE) ***
* note that you can paste into your terminal window by clicking your right mouse button
* at any stage you can press CTRL-C to exit the dialogs
* if you would prefer to run a single batch command, use ag_gbs_qc_prism.sh (-h for help) - e.g. 
  this allows you to run the analysis on any collection of input files
"

   ####### get and check RUN
   while true ; do
      echo "
please give the full name of gbs run you would like to process (e.g. 210712_A01439_0006_AHC7MJDRXY (novaseq) 

"
      # unquoted on purpose : an empty ARGRUN must give an empty default
      read_answer_with_default $ARGRUN
      RUN=$answer

      if [ -n "$RUN" ]; then 
         platform="unknown"
         if [ -d "$NOVASEQ_ROOT/$RUN" ]; then 
            platform=novaseq
            gbs_ROOT=$NOVASEQ_ROOT
            PROCESSING_ROOT=$NOVASEQ_PROCESSING_ROOT
            BCLCONVERT_ROOT=$NOVASEQ_BCLCONVERT_ROOT
            break
         fi
         echo "sorry can't find $RUN under $NOVASEQ_ROOT"
      fi

      # when not interactive the same answer would come back every time round the loop
      if [ "$INTERACTIVE" != yes ]; then
         echo "quitting - no usable run name was supplied (see the -r option)"
         exit 1
      fi
   done

   echo "will process $gbs_ROOT/$RUN"


   ####### find the sample sheet  - e.g. could be HNFW2DRXY.csv or SampleSheet.csv
   # NOTE(review): "base" is only assigned when SampleSheet.csv is absent, so when it is
   # present the design-time path below is built from whatever "base" already holds - confirm
   SAMPLE_SHEET=$NOVASEQ_ROOT/$RUN/SampleSheet.csv
   if [ ! -f "$SAMPLE_SHEET" ]; then
      # count the csv files with a glob so that none, or several, are both detected
      local -a sheet_candidates=( "$NOVASEQ_ROOT/$RUN"/*.csv )
      if [ ${#sheet_candidates[@]} -eq 1 ] && [ -f "${sheet_candidates[0]}" ]; then
         SAMPLE_SHEET=${sheet_candidates[0]}
         base=$(basename "$SAMPLE_SHEET" .csv)
         # the name of the sheet is expected to appear within the name of the run
         if ! echo "$RUN" | grep -q -- "$base" ; then
            echo "sorry can't find exactly one sample-sheet for this run under $NOVASEQ_ROOT/$RUN"
         fi
      else
         SAMPLE_SHEET=""
         base=""
         echo "sorry can't find exactly one sample-sheet for this run under $NOVASEQ_ROOT/$RUN"
      fi
   fi

   ####### try to find the design-time sample sheet, and compare 
   DESIGN_TIME_PATH=$NOVASEQ_ROOT/SampleSheets/${base}.csv 
   if [ ! -f "$DESIGN_TIME_PATH" ]; then
      echo "(warning , could not find design-time sample sheet where expected ( $DESIGN_TIME_PATH ) , so unable to check for changes)"
   elif ! diff "$DESIGN_TIME_PATH" "$SAMPLE_SHEET" > /dev/null ; then
      echo "

***** WARNING ******
There are some differences between the design-time sample sheet at $DESIGN_TIME_PATH and the run-time sample-sheet at $SAMPLE_SHEET .

Press any key to list differences
"
      read_answer_with_default y 
      diff "$DESIGN_TIME_PATH" "$SAMPLE_SHEET"
      echo "

are you OK to continue with the run-time version (i.e. the one having differences prefixed with > ) ? (y/n default y)"
      read_answer_with_default y
      # quoted : unquoted, a multi-word answer broke the test and was then treated as "y"
      if [ "$answer" != "y" ]; then
         echo "OK quitting"
         exit 1
      fi
   fi


   ####### check whether we can find sequence data - if not confirm  bclconvert is needed 
   echo "checking sequence data (looking under  $BCLCONVERT_ROOT/$RUN)...."
   if ! ls -lR "$BCLCONVERT_ROOT/$RUN/SampleSheet/bclconvert" 2>/dev/null | grep "fastq.gz" > /dev/null 2>&1 ; then
      echo "could not find fastq data under $BCLCONVERT_ROOT/$RUN so looks like bclconvert is needed - OK to run that ? (y/n, default=y)"
      read_answer_with_default y 
      if [ "$answer" != "y" ]; then
         echo "OK continuing..."
      else
         run_bclconvert 
      fi
   fi

   ####### check whether we can find lane q/c landmark - if not confirm 
   echo "checking for lane q/c results (looking for landmarks $BCLCONVERT_ROOT/$RUN/SampleSheet/fastqc_run/*.fastqc and $BCLCONVERT_ROOT/$RUN/SampleSheet/kmer_run/qc.kmer_analysis )...."
   if ! ls "$BCLCONVERT_ROOT/$RUN"/SampleSheet/fastqc_run/*.fastqc "$BCLCONVERT_ROOT/$RUN/SampleSheet/kmer_run/qc.kmer_analysis" > /dev/null 2>&1 ; then
      echo "could not find all lane q/c landmarks so looks like lane q/c is needed - OK to run that ? (y/n, default=y)"
      read_answer_with_default y
      if [ "$answer" != "y" ]; then
         echo "OK continuing..."
      else
         run_lane_qc
      fi
   fi

   ####### check whether we can find deduped data   - if not confirm
   echo "checking for deduped data (looking for $BCLCONVERT_ROOT/$RUN/SampleSheet/dedupe/*.fastq.gz )...."
   if ! ls "$BCLCONVERT_ROOT/$RUN"/SampleSheet/dedupe/*.fastq.gz > /dev/null 2>&1 ; then
      echo "could not find deduped fastq files (i.e. $BCLCONVERT_ROOT/$RUN/SampleSheet/dedupe/*.fastq.gz) so looks like dedupe is needed - OK to run that ? (y/n, default=y)"
      read_answer_with_default y
      if [ "$answer" != "y" ]; then
         echo "OK continuing..."
      else
         run_dedupe
      fi
   fi

   ######## continue on with GBS q/c ###########
   # set up output folder
   while true; do
      echo "

      please specify GBS q/c output base folder (or just press ENTER to use default , $PROCESSING_ROOT/$RUN)"
      read_answer_with_default "$PROCESSING_ROOT/$RUN"
      NEW_ROOT=$answer
      if [ -d "$NEW_ROOT" ]; then
         echo "warning - $NEW_ROOT already exists, use anyway ? (y/n, default=y)"
         read_answer_with_default y
         if [ "$answer" == "y" ]; then
            OUTPUT_ROOT=$NEW_ROOT
            break
         fi
      else
         mkdir -p "$NEW_ROOT"
         if [ -d "$NEW_ROOT" ]; then
            OUTPUT_ROOT=$NEW_ROOT
            break
         fi
         # when not interactive the same folder would be tried again for ever
         if [ "$INTERACTIVE" != yes ]; then
            echo "quitting - could not create output folder $NEW_ROOT"
            exit 1
         fi
      fi
   done

   PARAMETERS_FILE=$OUTPUT_ROOT/SampleProcessing.json
   echo "will use output root folder $OUTPUT_ROOT

   "

   ####### check whether run is in database  - import it if not (this also generates keyfiles)
   echo "

checking if run is in the database... 

"
   # parameter string handed to gupdate ( ";import" is added when the import is really wanted )
   local keyfile_params="fastq_folder_root=$NOVASEQ_BCLCONVERT_ROOT;run_folder_root=$NOVASEQ_ROOT;out_folder=/dataset/hiseq/active/key-files;sample_sheet=$SAMPLE_SHEET"

   if ! gquery  -t lab_report -p name=illumina_run_details "$RUN"  > /dev/null 2>&1 ; then
      echo "$RUN is not in the database - will generate keyfiles and set up run in the database first. About to execute  

gupdate --explain -t create_gbs_keyfiles -p \"$keyfile_params;import\" all

OK ? (y/n, default = y)
"
      read_answer_with_default y
      if [ "$answer" != "n" ]; then
         echo "backing up GBS tables " 
         dump_gbs_tables

         echo "running gupdate --explain -t create_gbs_keyfiles -p \"$keyfile_params;import\" all"
         if ! gupdate --explain -t create_gbs_keyfiles -p "$keyfile_params;import" all ; then
            echo "

sorry - quitting after received bad return code from database import -try looking at the log file shown above

"
            exit 1
         fi
      else
         echo "OK quitting - can't run GBS q/c until run setup has been completed. Most likely problem is an unexpected sample sheet format - after fixing, you can just retry setup with

gupdate --explain -t create_gbs_keyfiles -p \"$keyfile_params\" all

"
         exit 1
      fi
   fi


   ####### get and check the analysis type 
   while true ; do
      echo "
please give which q/c analysis you want (clientreport, html, trimmed_kmer_analysis, import_results, all, demultiplex, fasta_demultiplex, kgd, filtered_kgd, kmer_analysis, allkmer_analysis, fasta_sample, fastq_sample, annotation , bwa_mapping, unblind, historical_unblind , common_sequence, unblinded_plots, warehouse, special) (or just press enter to run all)  
(notes:

* entering kgd implies demultiplex will also be run (unless already done); annotation implies also kgd will be 
run (unless already done), etc.)
* entering html, clientreport or warehouse does not rerun any analyses - these just redo the html pages, and update the genophyle gbs tab
* entering import_results does not rerun any analyses - it just (re)imports the results (GBS yields) into the GBS postgres database
* running just kgd does not do unblinding - you will need to then run the unblind step
* but you don't usually need to run unblind stand-alone
* special, includes e.g. slippery slope
"
      read_answer_with_default all
      ANALYSIS=$answer

      case "$ANALYSIS" in
         demultiplex|html|trimmed_kmer_analysis|clientreport|warehouse|import_results|kgd|filtered_kgd|fasta_demultiplex|historical_unblind|unblind|kmer_analysis|allkmer_analysis|bwa_mapping|fasta_sample|annotation|fastq_sample|all|common_sequence|unblinded_plots|special)
            break
            ;;
         *)
            echo "analysis must be one of demultiplex, html, trimmed_kmer_analysis, clientreport, warehouse, import_results, kgd, filtered_kgd, fasta_demultiplex, historical_unblind, unblind, kmer_analysis, allkmer_analysis, bwa_mapping , annotation, fasta_sample , fastq_sample , common_sequence , unblinded_plots,special or all"
            ;;
      esac
   done

   echo "will use analysis=$ANALYSIS
   "

   case "$ANALYSIS" in
      html|trimmed_kmer_analysis|import_results|clientreport|warehouse|historical_unblind)
         # these are always run locally
         HPC_TYPE=local
         ;;
      *)
         ####### get and check whether to run locally or on the cluster
         echo "

should this run be queued on the compute cluster ? (y/n, default=y. If n, will be run locally)
"
         read_answer_with_default y
         if [ "$answer" != "n" ]; then 
            HPC_TYPE=slurm
         else
            HPC_TYPE=local
         fi
         ;;
   esac

}


function get_cohort_opts() {
   # Decide which libraries and cohorts of $RUN are to be processed.
   #
   # The libraries of the run (from gquery) are offered as the default ; any library for which
   # gquery reports no cohorts is offered for keyfile generation/import. Unless ANALYSIS is one
   # of the summary-type analyses (html, trimmed_kmer_analysis, import_results, clientreport,
   # warehouse, historical_unblind), libraries that already have *.demultiplex landmarks under
   # $OUTPUT_ROOT are offered for reprocessing and keyfile re-import, and existing cohort
   # folders are offered for cleaning.
   #
   # Globals read : RUN ANALYSIS OUTPUT_ROOT GBS_PRISM_BIN platform
   # Globals set  : LIBRARY_MONIKERS GBS_COHORTS (plus the working variables to_do, to_import,
   #                gbs_cohorts, library_moniker, library_cohorts, library_cohort, cohort)
   # Exits (status 1) if no libraries are left to process.
   local summary_only=no
   case "$ANALYSIS" in
      html|trimmed_kmer_analysis|import_results|clientreport|warehouse|historical_unblind)
         summary_only=yes
         ;;
   esac

   echo "getting default GBS libraries to process. . .

   "

   LIBRARY_MONIKERS=$(gquery -t lab_report -p "name=illumina_run_details;samples"  "$RUN" | sort -u)
   # e.g. returns
   # SQ1838
   # SQ1839
   # SQ1840

   echo "please enter libraries to run (default = $LIBRARY_MONIKERS)"

   # unquoted on purpose : the list is split so that it ends up separated by single spaces
   read_answer_with_default $LIBRARY_MONIKERS
   LIBRARY_MONIKERS=$answer

   # check that each library has been imported, in case we are re-running this, e.g. completing a run afer missing
   # keyfiles have been supplied
   to_import=""
   echo "checking keyfiles for libraries have been generated/imported . . . "
   for library_moniker in $LIBRARY_MONIKERS; do
       echo "checking $library_moniker (using gquery -t lab_report -p \"name=illumina_run_details;cohorts;sample_id=$library_moniker\" $RUN ). . ."
       library_cohorts=$(gquery -t lab_report -p "name=illumina_run_details;cohorts;sample_id=$library_moniker" "$RUN")
       if [ -z "$library_cohorts" ]; then
          echo "looks like $library_moniker needs generating/importing - OK to import ? (y/n, default=y)"
          read_answer_with_default y
          # quoted : unquoted, a multi-word answer broke the test and was then treated as "y"
          if [ "$answer" != "y" ]; then
             echo "ok will not generate/import keyfile for $library_moniker"
          else
             to_import="$to_import $library_moniker"
             echo "ok will generate/import keyfile for $library_moniker"
          fi
       fi
   done

   if [ -n "$to_import" ]; then
       do_extra_library_imports add_keyfile "$to_import"
   fi
   # these have now been dealt with - clear the list so that they are not imported a second
   # time by the re-import step further down
   to_import=""


   if [ "$summary_only" == no ]; then

      # check if already run - if so prompt for reimport
      to_do=""
      for library_moniker in $LIBRARY_MONIKERS; do
         # look for landmark file(s) - ls fails if there are none ; what it found (or its
         # complaint) is kept in manifest.txt
         if ls "$OUTPUT_ROOT"/*."${library_moniker}"*.demultiplex > "$OUTPUT_ROOT/manifest.txt" 2>&1 ; then
            echo "looks like $library_moniker has already been processed (found landmarks )"
            echo "do you want to reprocess this library ? (y/n, default = y)"
            read_answer_with_default y
            if [ "$answer" != "y" ]; then
               echo "ok skipping $library_moniker"
               continue
            fi

            to_do="$to_do $library_moniker"
            echo "ok will reprocess $library_moniker "
            echo "do you need the keyfile re-generated/imported ? (y/n, default =y)"
            read_answer_with_default y
            if [ "$answer" != "y" ]; then
               echo "ok will not re-generate/import keyfile"
            else
               to_import="$to_import $library_moniker"
               echo "ok will re-generate/import keyfile"
            fi
         else
            to_do="$to_do $library_moniker"
         fi
      done
   
      LIBRARY_MONIKERS=$to_do
   fi

   # exit if nothing to do ; run imports if any to do
   if [ -z "$LIBRARY_MONIKERS" ]; then
      echo "quitting , nothing to do"
      exit 1
   fi

   if [ -n "$to_import" ]; then
      do_extra_library_imports reimport_library "$to_import"
   fi

   echo "getting default GBS cohorts to process. . .

   "
   gbs_cohorts=""
   for library_moniker in $LIBRARY_MONIKERS; do
       echo "from $library_moniker . . ."
       library_cohorts=$(gquery -t lab_report -p "name=illumina_run_details;cohorts;sample_id=$library_moniker" "$RUN")
       for library_cohort in $library_cohorts; do
          gbs_cohorts="$gbs_cohorts ${library_moniker}.${library_cohort} "
       done
   done
   echo "please enter gbs cohorts to process (default=$gbs_cohorts)"
   # unquoted on purpose : collapses the padding between the cohort names
   read_answer_with_default $gbs_cohorts
   GBS_COHORTS=$answer


   # optionally clean up  (unless doing summaries etc)
   if [ "$summary_only" == no ]; then
      for cohort in $GBS_COHORTS; do
         if [ -d "$OUTPUT_ROOT/$cohort" ]; then
            echo "found existing results for $cohort in $OUTPUT_ROOT/$cohort - do you want to reset this cohort ? (y/n, default = y)"
            read_answer_with_default y
            if [ "$answer" != "y" ]; then
               echo "ok not cleaning - note , rerun may be incomplete"
            else
               # trace the clean command so that the user can see exactly what was run
               set -x
               "$GBS_PRISM_BIN/ag_gbs_qc_prism.sh" -f -m "$platform" -C local -a clean -O "$OUTPUT_ROOT" -r "$RUN" "$cohort"
               set +x
            fi
         fi
      done
   fi
}


function do_extra_library_imports() {
   # usually library keyfiles are all imported early in the processing, but 
   # sometimes the keyfile was not available, or was wrong, and needs to be 
   # (re) imported later
   #
   #   $1 : label for the kind of import - only used in the closing message
   #   $2 : space separated list of libraries to import ( nothing is done if it is empty )
   # Runs gupdate once per library, after a confirmation prompt.
   # Exits (status 1) if the user does not answer exactly "y", or if a gupdate fails.
   # Sets the globals import_type, extra_imports, library and return_code.

   import_type=$1
   extra_imports=$2
   if [ -z "$extra_imports" ]; then
      return
   fi

   echo "about to re-generate/import keyfiles for $extra_imports - ok ? (y/n, default = y)"
   read_answer_with_default y
   # quoted : unquoted, a multi-word answer broke the test and was then treated as "y"
   if [ "$answer" != "y" ]; then
      echo "ok quitting"
      exit 1
   fi

   local keyfile_params="fastq_folder_root=$NOVASEQ_BCLCONVERT_ROOT;run_folder_root=$NOVASEQ_ROOT;out_folder=/dataset/hiseq/active/key-files;sample_sheet=$SAMPLE_SHEET;import"

   for library in $extra_imports; do
      echo "running gupdate --explain -t create_gbs_keyfiles -p \"$keyfile_params\" $library "
      echo "press Enter to continue. . ."
      read_answer_with_default ""
      gupdate --explain -t create_gbs_keyfiles -p "$keyfile_params" "$library"
      return_code=$?
      if [ "$return_code" != 0 ]; then
         echo "looks like there was a problem with the import (exit code $return_code)  - check the above log - quitting"
         exit 1
      fi
   done

   echo "*** finished importing keyfiles  ( import type $import_type ) ***"
}

function get_custom_parameters() {
   # Ask for an optional custom parameters file and an optional custom fastq folder.
   # Sets the globals custom_parameters_file and custom_fastq_path ; each is left empty when
   # the answer is n (or N), which is also the default answer. A path that does not exist is
   # reported and asked for again.
   while true; do
      echo "if you wish to supply a custom parameters file, enter path to file (or just press Enter to use the 
default file $GBS_PRISM_BIN/nomerge_options_file.txt )

(file contents is keyed by a moniker indicating which module the parameter phrase is for, followed by the phrase. Example:

MergeTaxaTagCount -t n

will result in the -t n option being passed on to the MergeTaxaTagCount utility)
"
      read_answer_with_default n
      case "$answer" in
         n|N)
            answer=""
            break
            ;;
      esac
      # quoted, so that a path containing spaces is tested as a single path
      if [ -f "$answer" ]; then
         echo "passing on:"
         cat "$answer"
         break
      fi
      echo "sorry can't find a file called $answer"
   done
   custom_parameters_file=$answer


   while true; do
      echo "if you wish to supply a custom path to the fastq files or links, enter path (or just press Enter to use the
default path /dataset/hiseq/active/fastq-link-farm )
"
      read_answer_with_default n
      case "$answer" in
         n|N)
            answer=""
            break
            ;;
      esac
      if [ -d "$answer" ]; then
         echo "will use $answer"
         break
      fi
      echo "sorry can't find a folder called $answer"
   done
   custom_fastq_path=$answer
}

function dump_gbs_tables() {
   # Write dumps of the GBS database tables to $GBS_BACKUP_DIR, using gquery.
   local connection="interface_type=postgres;host=postgres_readonly"
   local pair table dump_name

   # table:dump-file pairs, each table being dumped in full
   #   gbskeyfilefact            - current state of the GBS keyfile table
   #   gbs_sampleid_history_fact - historical qc_sampleid (generated when a keyfile is *re*imported)
   #   hiseqsamplesheetfact      - the brdf table that has sample-sheet details in it
   #   gbsyieldfact              - the brdf table which has GBS yield stats (sample depth etc)
   for pair in \
      gbskeyfilefact:keyfile_dump.dat \
      gbs_sampleid_history_fact:qcsampleid_history.dat \
      hiseqsamplesheetfact:sample_sheet_dump.dat \
      gbsyieldfact:yield_dump.dat
   do
      table=${pair%%:*}
      dump_name=${pair#*:}
      gquery -t sql -p "$connection" "select * from $table" > $GBS_BACKUP_DIR/$dump_name
   done

   # dump of the brdf model of flowcell x library ( = biosample list x biosample) - here the
   # query is given to gquery as the path of a .sql file
   gquery -t sql -p "$connection" $GBS_PRISM_BIN/runs_libraries_dump.sql > $GBS_BACKUP_DIR/runs_libraries_dump.dat
}


function run_qc() {
   # Run the chosen analysis with ag_gbs_qc_prism.sh, after showing the command and asking for
   # a go-ahead.
   #
   # Globals read : GBS_PRISM_BIN SEQ_PRISMS_BIN platform HPC_TYPE ANALYSIS OUTPUT_ROOT RUN
   #                GBS_COHORTS gbs_version kgd_version (and what get_custom_parameters sets)
   # Globals set  : custom_parameters_phrase custom_fastq_path_phrase return_code
   # Writes $OUTPUT_ROOT/run_gbs_qc.log and $OUTPUT_ROOT/restart_qc.src.
   # Exits (status 1) if the go-ahead is not answered with exactly "y" ; a failure of the
   # analysis itself is only reported, and is left in return_code.

   get_custom_parameters

   custom_parameters_phrase="-p $GBS_PRISM_BIN/nomerge_options_file.txt"
   if [ -n "$custom_parameters_file" ]; then
      custom_parameters_phrase="-p $custom_parameters_file"
   fi

   custom_fastq_path_phrase=""
   if [ -n "$custom_fastq_path" ]; then
      custom_fastq_path_phrase="-q $custom_fastq_path"
   fi

   echo "

   Ready to run q/c , with the following: 

   $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C $HPC_TYPE $custom_parameters_phrase $custom_fastq_path_phrase -a $ANALYSIS -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS 

   "
   echo "OK to continue ? (y/n - default = y)"
   read_answer_with_default y
   # quoted : unquoted, a multi-word answer broke the test and was then treated as "y"
   if [ "$answer" != "y" ]; then
      echo "OK quitting"
      exit 1
   fi

   # start a fresh log with the command line ; the output of the run is appended below
   echo "
      running gbs_prism version $gbs_version , KGD version $kgd_version , using : 

      $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C $HPC_TYPE $custom_parameters_phrase $custom_fastq_path_phrase -a $ANALYSIS -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS
      " > "$OUTPUT_ROOT/run_gbs_qc.log"

   # the restart command is the same as the one run below, but without the -f option
   echo "
      export SEQ_PRISMS_BIN=$SEQ_PRISMS_BIN
      export GBS_PRISM_BIN=$GBS_PRISM_BIN 
      $GBS_PRISM_BIN/ag_gbs_qc_prism.sh  -m $platform -C $HPC_TYPE $custom_parameters_phrase $custom_fastq_path_phrase -a $ANALYSIS -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS" > "$OUTPUT_ROOT/restart_qc.src"

   echo "
      (if you need to kill this run, you can restart using : 
      source $OUTPUT_ROOT/restart_qc.src

      - once that completes you will also need to rerun this interactive script to generate the html 
      summary page, and again to generate client reports , and again to import the results (if applicable) 
      (choose html , and then clientreport, warehouse and import_results, when prompted for the analysis)


      Logging output is captured in the following files: 

      * $GBS_PRISM_BIN/ag_gbs_qc_prism.sh logs to $OUTPUT_ROOT/run_gbs_qc.log

      "
    
   # the option phrases and the cohort list are left unquoted on purpose, so that each
   # splits into separate arguments (and an empty phrase adds none)
   $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C $HPC_TYPE $custom_parameters_phrase $custom_fastq_path_phrase -a $ANALYSIS -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS  >> "$OUTPUT_ROOT/run_gbs_qc.log" 2>&1
   return_code=$?

   if [ "$return_code" != 0 ]; then
      echo "

>>>> ( looks like there was a problem with ag_gbs_qc_prism.sh (non-zero return code $return_code ) - check $OUTPUT_ROOT/run_gbs_qc.log  ) 
"
   else
       echo "** run_qc looks ok **"
   fi
}

function import_results() {
   # Import the q/c results of $RUN into the database, by running import_hiseq_reads_tags_cv.sh
   # and then (only if that succeeds) import_kgd_stats.sh, after asking for a go-ahead.
   #
   # Globals read : OUTPUT_ROOT GBS_PRISM_BIN RUN return_code
   # Globals set  : return_code1 return_code2
   # Output of the imports is written to $OUTPUT_ROOT/database_results_import.log.
   # Exits (status 1) if return_code is non-zero on entry, or if either import fails.
   mkdir -p "$OUTPUT_ROOT/html"

   # return_code may not have been set at all by the time this is called ; that is treated as
   # success (made explicit here, rather than relying on the test itself failing)
   if [ "${return_code:-0}" != 0 ]; then
      echo "

looks like there may have been a problem with the q/c (non-zero exit code from ag_gbs_qc_prism.sh )  - 
suggest you check  $OUTPUT_ROOT/run_gbs_qc.log. You can manually run the import later using : 

$GBS_PRISM_BIN/import_hiseq_reads_tags_cv.sh -r $RUN
$GBS_PRISM_BIN/import_kgd_stats.sh -r $RUN

"
      exit 1
   fi

   echo "

looks like q/c completed OK

Ready to summarise and import yield stats using :

$GBS_PRISM_BIN/import_hiseq_reads_tags_cv.sh -r $RUN
$GBS_PRISM_BIN/import_kgd_stats.sh -r $RUN

OK to continue ? (y/n , default = y)"

   read_answer_with_default y
   # quoted : unquoted, a multi-word answer broke the test and was then treated as "y"
   if [ "$answer" != "y" ]; then
      echo "OK not importing results"
      return
   fi

   echo "
      running

$GBS_PRISM_BIN/import_hiseq_reads_tags_cv.sh -r $RUN
$GBS_PRISM_BIN/import_kgd_stats.sh -r $RUN

      " >> "$OUTPUT_ROOT/run_gbs_qc.log"
   rm -f "$OUTPUT_ROOT/database_results_import.log"
   "$GBS_PRISM_BIN/import_hiseq_reads_tags_cv.sh" -r "$RUN" >> "$OUTPUT_ROOT/database_results_import.log" 2>&1
   return_code1=$?
   if [ "$return_code1" == 0 ]; then
      "$GBS_PRISM_BIN/import_kgd_stats.sh" -r "$RUN" >> "$OUTPUT_ROOT/database_results_import.log" 2>&1
      return_code2=$?
   else
      # the second import is not attempted, and is counted as having failed
      return_code2=1
   fi

   if [ "$return_code1" != 0 ] || [ "$return_code2" != 0 ]; then
      echo "

** looks like there was a problem with one of the import(non-zero return code) - check $OUTPUT_ROOT/database_results_import.log  **
( you can run manually using : 

$GBS_PRISM_BIN/import_hiseq_reads_tags_cv.sh -r $RUN
$GBS_PRISM_BIN/import_kgd_stats.sh -r $RUN
)

"
      exit 1
   else
       echo "** database import looks ok **"
   fi
}

# Run the trimmed_kmer_analysis step of ag_gbs_qc_prism.sh for the run, after
# confirming with the user.
#
# Does nothing (beyond creating $OUTPUT_ROOT/trimmed_kmer_analysis and printing
# a message) unless $OUTPUT_ROOT/bwa_mapping exists.
#
# Globals read : OUTPUT_ROOT, GBS_PRISM_BIN, platform, HPC_TYPE, RUN, GBS_COHORTS
# Globals set  : answer (via read_answer_with_default), return_code
# Files        : $OUTPUT_ROOT/generate_trimmed_kmer_analysis.log is recreated on
#                each run and holds the command line followed by its output
# Exits the script with status 1 if ag_gbs_qc_prism.sh returns non-zero.
function generate_trimmed_kmer_analysis() {
   mkdir -p "$OUTPUT_ROOT/trimmed_kmer_analysis"

   if [ ! -d "$OUTPUT_ROOT/bwa_mapping" ]; then
      echo "*** unable to do trimmed_kmer_analysis as no trimmed fastq available (do a bwa_mapping run first) ***"
      return
   fi

   echo "

   Ready to generate trimmed kmer analysis , with the following:

   $GBS_PRISM_BIN/ag_gbs_qc_prism.sh  -m $platform -C $HPC_TYPE -f -a trimmed_kmer_analysis -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS

   "
   echo "OK to run that  ? (y/n , default = y)"
   read_answer_with_default y
   # answer must be quoted : an unquoted multi-word reply makes [ ] error out
   # and fall through to the branch that runs the analysis
   if [ "$answer" != "y" ]; then
      echo "OK will not run trimmed kmer analysis"
   else
      # the header starts a fresh log (>) and the command appends (>>) ; previously the
      # header was appended and then wiped when the command truncated the log
      echo "
      running

       $GBS_PRISM_BIN/ag_gbs_qc_prism.sh  -m $platform -C $HPC_TYPE -f -a trimmed_kmer_analysis -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS 

      " > "$OUTPUT_ROOT/generate_trimmed_kmer_analysis.log"
      # GBS_COHORTS is deliberately left unquoted - presumably a space separated list of
      # cohorts that must be passed as separate arguments (TODO confirm against get_cohort_opts)
      "$GBS_PRISM_BIN/ag_gbs_qc_prism.sh"  -m "$platform" -C "$HPC_TYPE" -f -a trimmed_kmer_analysis -O "$OUTPUT_ROOT" -r "$RUN" $GBS_COHORTS  >> "$OUTPUT_ROOT/generate_trimmed_kmer_analysis.log" 2>&1
      return_code=$?
      if [ "$return_code" != 0 ]; then
         echo "

** looks like there was a problem with generating trimmed kmer analysis (non-zero return code ) - check $OUTPUT_ROOT/generate_trimmed_kmer_analysis.log  **

"
         exit 1
      else
          echo "** trimmed kmer analysis looks ok **"
      fi
   fi
}

# Run the unblinded_plots step of ag_gbs_qc_prism.sh for the run, after
# confirming with the user.
#
# Globals read : OUTPUT_ROOT, GBS_PRISM_BIN, platform, HPC_TYPE, RUN, GBS_COHORTS
# Globals set  : answer (via read_answer_with_default), return_code
# Files        : $OUTPUT_ROOT/generate_unblinded_plots.log is recreated on each
#                run and holds the command line followed by its output
# Exits the script with status 1 if ag_gbs_qc_prism.sh returns non-zero.
function generate_unblinded_plots() {
   echo "

   Ready to attempt unblinded plots , with the following:

   $GBS_PRISM_BIN/ag_gbs_qc_prism.sh  -m $platform -C $HPC_TYPE -f -a unblinded_plots -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS

   "
   echo "OK to run that  ? (y/n , default = y)"
   read_answer_with_default y
   # answer must be quoted : an unquoted multi-word reply makes [ ] error out
   # and fall through to the branch that runs the plots
   if [ "$answer" != "y" ]; then
      echo "OK will not run unblinded plots"
   else
      # the header starts a fresh log (>) and the command appends (>>) ; previously the
      # header was appended and then wiped when the command truncated the log
      echo "
      running

       $GBS_PRISM_BIN/ag_gbs_qc_prism.sh  -m $platform -C $HPC_TYPE -f -a unblinded_plots -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS

      " > "$OUTPUT_ROOT/generate_unblinded_plots.log"
      # GBS_COHORTS is deliberately left unquoted - presumably a space separated list of
      # cohorts that must be passed as separate arguments (TODO confirm against get_cohort_opts)
      "$GBS_PRISM_BIN/ag_gbs_qc_prism.sh"  -m "$platform" -C "$HPC_TYPE" -f -a unblinded_plots -O "$OUTPUT_ROOT" -r "$RUN" $GBS_COHORTS  >> "$OUTPUT_ROOT/generate_unblinded_plots.log" 2>&1
      return_code=$?
      if [ "$return_code" != 0 ]; then
         echo "

** looks like there was a problem with generating generate_unblinded_plots (non-zero return code ) - check $OUTPUT_ROOT/generate_unblinded_plots.log  **

"
         exit 1
      else
          echo "** unblinded plot run looks ok **"
      fi
   fi
}


# Generate the html summaries for the run by calling ag_gbs_qc_prism.sh with
# "-C local -a html", after confirming with the user.
#
# Globals read : OUTPUT_ROOT, GBS_PRISM_BIN, platform, RUN, GBS_COHORTS
# Globals set  : answer (via read_answer_with_default), return_code
# Files        : appends a note to $OUTPUT_ROOT/run_gbs_qc.log ; command output
#                is written to $OUTPUT_ROOT/generate_html.log (overwritten)
# Exits the script with status 1 if the user declines or the command fails.
function generate_html() {
   mkdir -p "$OUTPUT_ROOT/html"

   echo "

   Ready to generate html summaries  , with the following:

   $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a html -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS 

   "
   echo "OK to continue ? (y/n , default = y)"
   read_answer_with_default y
   # answer must be quoted : an unquoted multi-word reply makes [ ] error out
   # and fall through to the branch that runs the html build instead of quitting
   if [ "$answer" != "y" ]; then
      echo "OK quitting"
      exit 1
   else
      echo "
      running

      $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a html -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS

      " >> "$OUTPUT_ROOT/run_gbs_qc.log"
      # GBS_COHORTS is deliberately left unquoted - presumably a space separated list of
      # cohorts that must be passed as separate arguments (TODO confirm against get_cohort_opts)
      "$GBS_PRISM_BIN/ag_gbs_qc_prism.sh" -f  -m "$platform" -C local -a html -O "$OUTPUT_ROOT" -r "$RUN" $GBS_COHORTS > "$OUTPUT_ROOT/generate_html.log" 2>&1
      return_code=$?
      if [ "$return_code" != 0 ]; then
         echo "

** looks like there was a problem with generating html (non-zero return code ) - check $OUTPUT_ROOT/generate_html.log  **
( you can run manually using : $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a html -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS )

"
         exit 1
      else
          echo "** html looks ok **"
      fi
   fi

}

# Placeholder step : currently only emits one empty line on stdout.
function generate_special() {
   printf '\n'
   # This step formerly offered to (re)build a mash up of all the
   # self-relatedness plots ("slippery slope" plot). That is deprecated ; the
   # retired code is retained below for reference only.
   #echo "do you want to (re)generate the slippery slope plot ? (y/n default y)"
   #read_answer_with_default n
   #if [ $answer == "y" ]; then
   #   set -x
   #   $GBS_PRISM_BIN/SelfRelDepth.sh -I
   #   set +x
   #fi
}

# Generate the clientreport summaries for the run by calling ag_gbs_qc_prism.sh
# with "-C local -a clientreport", after confirming with the user.
#
# Globals read : OUTPUT_ROOT, GBS_PRISM_BIN, platform, RUN, GBS_COHORTS
# Globals set  : answer (via read_answer_with_default), return_code
# Files        : appends a note to $OUTPUT_ROOT/run_gbs_qc.log ; command output
#                is written to $OUTPUT_ROOT/generate_clientreport.log (overwritten)
# Exits the script with status 1 if the command returns non-zero.
function generate_clientreport() {

   echo "

   Ready to generate clientreport summaries  , with the following:

   $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a clientreport -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS

   "
   echo "OK to continue ? (y/n , default = y)"
   read_answer_with_default y
   # answer must be quoted : an unquoted multi-word reply makes [ ] error out
   # and fall through to the branch that generates the reports
   if [ "$answer" != "y" ]; then
      echo "OK will not generate clientreports "
   else
      echo "
      running

      $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a clientreport -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS

      " >> "$OUTPUT_ROOT/run_gbs_qc.log"
      # GBS_COHORTS is deliberately left unquoted - presumably a space separated list of
      # cohorts that must be passed as separate arguments (TODO confirm against get_cohort_opts)
      "$GBS_PRISM_BIN/ag_gbs_qc_prism.sh" -f  -m "$platform" -C local -a clientreport -O "$OUTPUT_ROOT" -r "$RUN" $GBS_COHORTS > "$OUTPUT_ROOT/generate_clientreport.log" 2>&1
      return_code=$?
      if [ "$return_code" != 0 ]; then
         # the manual-run hint repeats the exact command that was attempted, cohorts included
         echo "

** looks like there was a problem with generating clientreports (non-zero return code ) - check $OUTPUT_ROOT/generate_clientreport.log  **
( you can run manually using : $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a clientreport -O $OUTPUT_ROOT -r $RUN $GBS_COHORTS )

"
         exit 1
      else
          echo "** clientreport looks ok **"
      fi
   fi
}

# Update the genophyle gbs tab for the run by calling ag_gbs_qc_prism.sh with
# "-C local -a warehouse", after confirming with the user.
#
# Globals read : OUTPUT_ROOT, GBS_PRISM_BIN, platform, RUN
# Globals set  : answer (via read_answer_with_default), return_code
# Files        : appends a note to $OUTPUT_ROOT/run_gbs_qc.log ; command output
#                is written to $OUTPUT_ROOT/run_warehouse_update.log (overwritten)
# Exits the script with status 1 if the command returns non-zero.
function update_warehouse() {

   echo "

   Ready to update the genophyle gbs tab, with the following:

   $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a warehouse -O $OUTPUT_ROOT -r $RUN 

   "
   echo "OK to continue ? (y/n , default = y)"
   read_answer_with_default y
   # answer must be quoted : an unquoted multi-word reply makes [ ] error out
   # and fall through to the branch that performs the update
   if [ "$answer" != "y" ]; then
      echo "OK will not update genophyle gbs tab "
   else
      echo "
      running

      $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a warehouse -O $OUTPUT_ROOT -r $RUN 

      " >> "$OUTPUT_ROOT/run_gbs_qc.log"
      "$GBS_PRISM_BIN/ag_gbs_qc_prism.sh" -f  -m "$platform" -C local -a warehouse -O "$OUTPUT_ROOT" -r "$RUN"  > "$OUTPUT_ROOT/run_warehouse_update.log" 2>&1
      return_code=$?
      if [ "$return_code" != 0 ]; then
         # point at the log that was actually written above (run_warehouse_update.log)
         echo "

** looks like there was a problem updating the genophyle GBS tab (non-zero return code ) - check $OUTPUT_ROOT/run_warehouse_update.log  **
( you can run manually using : $GBS_PRISM_BIN/ag_gbs_qc_prism.sh -f  -m $platform -C local -a warehouse -O $OUTPUT_ROOT -r $RUN )

"
         exit 1
      else
          echo "** warehouse update looks ok **"
      fi
   fi
}

# ---- main ----
# Parse the command line, then call the run / cohort option helpers.
# NOTE(review): ANALYSIS is not assigned anywhere in this part of the file -
# presumably get_run_opts or get_cohort_opts sets it ; confirm.
get_opts "$@"
get_run_opts
get_cohort_opts

# ANALYSIS selects which steps run. The six recognised single-step values are
# import_results, html, clientreport, warehouse, trimmed_kmer_analysis and
# unblinded_plots ; any other value runs the full pipeline. ANALYSIS is quoted
# throughout so an empty or multi-word value cannot break the [ ] tests.

if [ "$ANALYSIS" == "trimmed_kmer_analysis" ]; then
   generate_trimmed_kmer_analysis
fi

if [ "$ANALYSIS" == "unblinded_plots" ]; then
   generate_unblinded_plots
fi

# full pipeline only : q/c followed by both optional analyses
if [[ ( "$ANALYSIS" != "import_results" ) && ( "$ANALYSIS" != "html" )  && ( "$ANALYSIS" != "clientreport" ) && ( "$ANALYSIS" != "warehouse" ) && ( "$ANALYSIS" != "trimmed_kmer_analysis" ) && ( "$ANALYSIS" != "unblinded_plots" ) ]]; then 
   run_qc
   generate_trimmed_kmer_analysis
   generate_unblinded_plots
fi

# html is (re)built for everything except import_results, clientreport and warehouse
if [[ ( "$ANALYSIS" != "import_results" )  && ( "$ANALYSIS" != "clientreport" )  && ( "$ANALYSIS" != "warehouse" ) ]]; then
   generate_html
fi

# database import : full pipeline or import_results only
if [[ ( "$ANALYSIS" != "html" )  && ( "$ANALYSIS" != "clientreport" )  && ( "$ANALYSIS" != "warehouse" )  && ( "$ANALYSIS" != "trimmed_kmer_analysis" ) && ( "$ANALYSIS" != "unblinded_plots" )  ]]; then
   # import_results refuses to run when return_code is non-zero, so reset it here
   return_code=0
   import_results
fi

# client reports are built for everything except import_results, html and warehouse
if [[ ( "$ANALYSIS" != "import_results" )  && ( "$ANALYSIS" != "html" ) && ( "$ANALYSIS" != "warehouse" ) ]]; then
   generate_clientreport
fi

# warehouse update runs for everything except import_results, html and clientreport
if [[ ( "$ANALYSIS" != "import_results" )  && ( "$ANALYSIS" != "html" ) && ( "$ANALYSIS" != "clientreport" ) ]]; then
   update_warehouse
fi


# full pipeline only
if [[ ( "$ANALYSIS" != "import_results" ) && ( "$ANALYSIS" != "html" )  && ( "$ANALYSIS" != "clientreport" )  && ( "$ANALYSIS" != "warehouse" ) && ( "$ANALYSIS" != "trimmed_kmer_analysis" ) && ( "$ANALYSIS" != "unblinded_plots" ) ]]; then
   generate_special
fi