## Makefile.config

## The following global variables are available to dependent pipelines
## and furthermore can be overridden during individual calls to make, e.g.:
## `make CONFIG_INPUT_DIR=/path/to/alternate/configs`
## Note that overriding variables can produce confusing behavior if you
## don't know what you're doing, so please be careful.


#### Variables you likely need to set for your own project

## installation directory for your project. can be manipulated if you know
## what you're doing.
## two roots are defined and currently point at the same directory, but they
## are referenced independently in this file:
##  - PROJECT_BASE_DIR anchors the analysis config, 1KG/known-signal/annotation
##    files, all output directories, and the shared makefiles/source
##  - PLCO_INSTALL_PREFIX anchors the chip freeze, external file, imputed data,
##    and phenotype inputs
PROJECT_BASE_DIR := /CGF/GWAS/Scans/PLCO/builds/1/plco-analysis
PLCO_INSTALL_PREFIX := /CGF/GWAS/Scans/PLCO/builds/1/plco-analysis
## chip/platform names processed for the study. names are arbitrary, with one
## restriction: an underscore ("_") is what separates a platform from its
## imputation batch (e.g. GSA_batch1), so do not invent chip names that
## contain underscores.
## the GSA batches are generated here as GSA_batch1 through GSA_batch5
PLATFORMS := OmniX Omni25 Omni5 Oncoarray $(addprefix GSA_batch,$(shell seq 1 5))
## directory of chip freeze files; if you have access to the relevant spaces on cgems,
## you can use those; otherwise, set this to where you've placed your chip files from
## your DUPS request or similar other data download
CHIP_FREEZE_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/../current-chip-final-subjects
## directory containing the cross-platform duplicate resolution file, by default named
## 'PLCO_final_subject_list_Ancestry_UniqGenotypePlatform_04132020.txt'
EXTERNAL_FILE_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/external-files
## full path (directory and filename) of the duplicates file
UNIQUE_SUBJECT_LIST := $(EXTERNAL_FILE_INPUT_DIR)/PLCO_final_subject_list_Ancestry_UniqGenotypePlatform_04132020.txt
## directory of imputed data, after Rsq filtering and duplicate subject removal
FILTERED_IMPUTED_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/../freeze2-imputation/raw-mis-nonoverlapping-subjects
## directory of configuration files for analysis models
CONFIG_INPUT_DIR := $(PROJECT_BASE_DIR)/config
## directory of 1000 Genomes reference files. this is *optional*; there is a pipeline
## that will automatically download these files for you if you want that, but you can
## instead point to a local directory containing the 1KG files if you don't want
## to download a redundant copy
KG_REFERENCE_INPUT_DIR := $(PROJECT_BASE_DIR)/1KG_files
## 1000 Genomes reference data information for ldscores for use with BOLT
## if you want to use local files, make sure they match the format specified here
## (ALL.chr#.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz):
## the prefix below holds the directory and everything before the chromosome
## number, the suffix holds everything after it
KG_GENOTYPES_PREFIX := $(KG_REFERENCE_INPUT_DIR)/ALL.chr
KG_GENOTYPES_SUFFIX := .shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz
## manifest filename, given without a directory component
## NOTE(review): presumably the manifest published with the release under
## KG_DOWNLOAD_SITE; confirm where the consuming pipeline looks for it
KG_MANIFEST := 20181203_biallelic_SNV_manifest.txt
## this variable is only required if you want the pipeline to download the 1KG data for you
KG_DOWNLOAD_SITE := ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV
## directory of known signals for pretty plotting. this is *optional* and only
## used in shared-makefiles/Makefile.plotting
KNOWN_SIGNALS_INPUT_DIR := $(PROJECT_BASE_DIR)/known-signals
## full path of the phenotype data file used for all analyses
## format is: plain text, tab-delimited, header row with variable names, single column with subject IDs
## note that this can be overridden (make PHENOTYPE_FILENAME=/path/to/different/file saige) to run
## analyses using custom phenotypes, skipping the standard atlas-style processing mechanism
PHENOTYPE_FILENAME := $(PLCO_INSTALL_PREFIX)/phenotypes/v10/atlas_v10.with_na.augmented.02nov2020.tsv
## name, in the phenotype file's header row, of the subject ID column
PHENOTYPE_ID_COLNAME := plco_id


#### Variables you can change but don't have to for successful run

## all locations in this section default to subdirectories of PROJECT_BASE_DIR

## pipeline and output location for bgen imputed data reformatting pipeline results
BGEN_OUTPUT_DIR := $(PROJECT_BASE_DIR)/bgen
## pipeline and output location for chip QC pipeline results
CLEANED_CHIP_OUTPUT_DIR := $(PROJECT_BASE_DIR)/cleaned-chips-by-ancestry
## pipeline and output location for relatedness pipeline results
RELATEDNESS_OUTPUT_DIR := $(PROJECT_BASE_DIR)/relatedness
## pipeline and output location for ancestry pipeline results
ANCESTRY_OUTPUT_DIR := $(PROJECT_BASE_DIR)/ancestry
## pipeline and output location for ldsc pipeline results
LDSC_OUTPUT_DIR := $(PROJECT_BASE_DIR)/ldsc
## output location for fastGWA GRM results files
## note the pipeline itself is under shared-makefiles
FASTGWA_GRM_OUTPUT_DIR := $(PROJECT_BASE_DIR)/fastgwa-grm
## output location for all primary and meta-analysis
## this is one of a few very likely override targets, if you
## want to run some analyses that you don't want to conflict with
## an existing analysis output path
RESULT_OUTPUT_DIR := $(PROJECT_BASE_DIR)/results
## output location for globus results, for data distribution to CBIIT
## or potentially future collaborators
GLOBUS_OUTPUT_DIR := $(PROJECT_BASE_DIR)/globus
## pipeline location for analysis module pipelines, meta-analysis, plotting, ldscore regression, globus.
## basically, any pipeline that needs to run for each analysis config file
SHARED_MAKEFILES := $(PROJECT_BASE_DIR)/shared-makefiles
## utility script location. most of the former contents of this directory have been
## migrated out to separately maintained packages that are pulled in with conda
SHARED_SOURCE := $(PROJECT_BASE_DIR)/shared-source
## chromosomes to analyze; generated with `seq` as the integers 1 through 22
CHRS := $(shell seq 1 22)


#### Supported software variables; shouldn't need to be redefined if using the
#### conda environment specification provided with the pipeline

## default memory allocation for plink, in MB; handed to both plink calls
## below through their --memory flag
PLINK_MEMORY_LIMIT := 16000
## plink v1.9 call
PLINK19 := plink --memory $(PLINK_MEMORY_LIMIT)
## plink v2 call
PLINK2 := plink2 --memory $(PLINK_MEMORY_LIMIT)
## UCSC liftOver call
LIFTOVER_EXECUTABLE := liftOver
## path to the hg19->hg38 liftOver chain file for chip conversion;
## resolved under $(CONDA_PREFIX), which this file reads but does not set
LIFTOVER_19_TO_38 := $(CONDA_PREFIX)/share/liftover-chains/hg19ToHg38.over.chain.gz
## bgenix call
BGENIX := bgenix
## bolt-lmm call
BOLTLMM_EXECUTABLE := bolt
## hg38 genetic map, provided by bolt under tables/
GRCH38_RECOMBINATION_MAP := $(CONDA_PREFIX)/share/bolt-lmm/tables/genetic_map_hg38_withX.txt.gz
## eigensoft smartpca call
SMARTPCA := smartpca
## number of principal components to be generated by smartpca
## however many you set here, this is the number (as PC1 - PCN)
## automatically available to specify as analysis config covariates
SMARTPCA_N_PCS := 20
## ldsc python scripts: the ldsc.py call and the munge_sumstats.py call
LDSC_PY := ldsc.py
MUNGE_SUMSTATS_PY := munge_sumstats.py
## internal script for annotating results files with reference frequencies
ANNOTATE_FREQUENCY := $(CONDA_PREFIX)/bin/annotate_frequency.out
## reference frequency files, by supercontinent
## metadata file of id chr pos a0 a1
TOPMED_FREQUENCY_METADATA := $(PROJECT_BASE_DIR)/annotations/freq.prefix.tsv.bz2
## path and prefix of frequency data by supercontinent
TOPMED_FREQUENCY_FILE_PREFIX := $(PROJECT_BASE_DIR)/annotations/freq.
## suffix of frequency data by supercontinent
TOPMED_FREQUENCY_FILE_SUFFIX := .tsv.bz2
## R script for deploying R::construct.model.matrix in analysis pipelines
CONSTRUCT_MODEL_MATRIX := $(SHARED_SOURCE)/construct_model_matrix.R
## internal script for annotating results files with rsIDs (instead of chr:pos IDs from TOPMed)
ANNOTATE_RSID := $(CONDA_PREFIX)/bin/annotate_rsids_from_linker.out
## linker file for chr:pos:ref:alt IDs to rsIDs
RSID_LINKER_FILE := $(PROJECT_BASE_DIR)/annotations/rsid_linker.tsv.bz2
## internal script for merging the results of split saige runs for categorical (N>2) variables
COMBINE_CATEGORICAL_RUNS := $(CONDA_PREFIX)/bin/combine_categorical_runs.out
## R script for launching gwasqcplots manhattan/qq plotter
MANHATTAN_LAUNCHER := $(SHARED_SOURCE)/manhattan_loci_colored_launcher.R
## utility python function for dealing with output paths in globus pipeline
FORMAT_GLOBUS_PREFIXES := $(SHARED_SOURCE)/format_globus_prefixes.py
## internal script for creating association results directories and providing their prefixes to the make DAG
INITIALIZE_OUTPUT_DIRECTORIES := $(CONDA_PREFIX)/bin/initialize_output_directories.out
## yaml configuration for tracking file extensions; processed into Make variable space at end of this file
EXTENSION_CONFIG := $(CONDA_PREFIX)/share/initialize_output_directories/extensions.config.yaml
## simple R script for manipulating categorical data
PARSE_CATEGORICAL_OUTPUT := $(SHARED_SOURCE)/parse_categorical_output.R
## bash script that deals with the data source for bolt ldscores files
RESOLVE_BOLT_LDSCORE := $(SHARED_SOURCE)/resolve_bolt_ldscore.bash
## R script for deploying the first saige round, relatedness estimation
SAIGE_ROUND_1_SCRIPT := $(SHARED_SOURCE)/saige_round1.R
## saige cutoff parameter for degree of relatedness
SAIGE_ROUND_1_RELATEDNESS_CUTOFF := 0.05
## R script for deploying the second saige round, association
SAIGE_ROUND_2_SCRIPT := $(SHARED_SOURCE)/saige_round2.R
## graf call (relatedness)
GRAF_EXECUTABLE := graf
## graf PlotPopulations perl call
## note that the variant of graf in conda has a patched shebang, such that it can
## be called as an executable correctly
GRAF_POP := PlotPopulations.pl
## location of graf internal reference bim file, for rsID mapping
GRAF_1KG_VARIANT_BIMFILE := $(CONDA_PREFIX)/share/graf/G1000FpGeno.bim
## sanitized graf ancestry groups, with " " values replaced by underscores
## (space-separated list, one word per ancestry group)
CLEANED_ANCESTRY_NAMES := European East_Asian Other South_Asian African_American Hispanic1 Hispanic2
## metal meta-analysis call
METAL_EXECUTABLE := metal
## internal script for merging files into CBIIT-approved globus distribution format
MERGE_FILES_FOR_GLOBUS := $(CONDA_PREFIX)/bin/merge_files_for_globus.out
## accepted analysis tools, which should have shared-makefiles/Makefile.NAME available
## (NAME being each word of this list)
SUPPORTED_METHODS := saige boltlmm fastgwa

## software subject count restrictions, one per supported analysis tool;
## extremely empirical and subject to change
FASTGWA_MINIMUM_VALID_SUBJECT_COUNT := 2000
BOLTLMM_MINIMUM_VALID_SUBJECT_COUNT := 3000
SAIGE_MINIMUM_VALID_SUBJECT_COUNT := 1001
## for fastGWA, a number of threads to use during run
N_THREADS := 2
## for case/control comparisons (saige), minimum number of available cases
## in a given platform comparison for the analysis to be run at all
MINIMUM_VALID_CASE_COUNT := 30

## chip cleaning parameters
## plink heterozygosity F statistic max value (absolute)
HET_F_MAX := 0.2
## plink IBS/IBD pi_hat minimum value for reporting
PIHAT_MIN := 0.05
## minimum number of samples in a chip/ancestry combination permitted
SAMPLE_SIZE_MIN := 50
## cutoff number of samples for switching between oneshot and parallel runs of plink --genome
SAMPLE_SIZE_LARGE := 10000
## when running plink --genome --parallel, how many jobs to parallelize into
## may need to be increased down the line
GENOME_GZ_JOB_SPLIT := 20

## LDSCORES estimation parameters, for ldsc/bolt backend
## minor allele frequency threshold
LDSC_MAF_THRESHOLD := 0.005


#### Make infrastructure. Should only be edited by an expert.

## define tracking success file suffix: the job handlers in this file touch
## <output prefix><suffix> when a job's command exits with status 0
TRACKING_SUCCESS_SUFFIX := .success
## define tracking fail file suffix: the job handlers in this file touch
## <output prefix><suffix> when a job's command exits with a nonzero status
TRACKING_FAIL_SUFFIX := .fail


## cluster queue settings
## these are by default set to things that work for cgems/ccad/sge
## if you're on a different system, you'll have to update these
## as well as the handler itself.

## internal script for compatibility between make and qsub/sge
QSUB_JOB_MONITOR := $(CONDA_PREFIX)/bin/qsub_job_monitor.out
## internal script for compatibility between make and sbatch/slurm
SBATCH_JOB_MONITOR := $(CONDA_PREFIX)/bin/sbatch_job_monitor.out
## theoretically there should eventually be more interface programs for other clusters;
## set which one is used here. this is the program the sub_handler* macros invoke
ACTIVE_JOB_MONITOR := $(QSUB_JOB_MONITOR)

## queue and resource default settings
## the *_QUEUE values are passed to the job monitor's -q option and the *_TIME
## values to its -r option by the sub_handler* macros
## note for sbatch you'll most likely want to add memory and cpu settings
## NOTE(review): the commented-out `norm` values look like the alternates to
## use with the sbatch monitor; confirm before enabling
NORMAL_QUEUE := all.q
## NORMAL_QUEUE := norm
NORMAL_TIME := h_rt=23:45:00
LONG_QUEUE := long.q
## LONG_QUEUE := norm
LONG_TIME := h_rt=71:45:00
HUGE_QUEUE := bigmem.q
## HUGE_QUEUE := norm
## the remaining values are not used by any handler in this file
## NOTE(review): presumably passed by callers to sub_handler_specify_queue_time; confirm
SHORT_TIME := h_rt=4:30:00
HOUR_TIME := h_rt=1:00:00


## shared helper for the handlers below: writes the command script for one job
##   $(1): output prefix; the script is written to $(1).command.bash
##   $(2): command text to run
## the generated script runs the command, then checks its exit status: on 0 it
## touches $(1)$(TRACKING_SUCCESS_SUFFIX) and exits 0, otherwise it touches
## $(1)$(TRACKING_FAIL_SUFFIX) and exits 1.
## backslashes, double quotes, and dollar signs in $(2) are escaped so that
## they pass through the double-quoted shell string and the escape processing
## intact.
## printf '%b\n' is used rather than `echo -e`: both expand the \n and \t
## escapes, but `echo -e` is not understood by every shell (dash prints a
## literal "-e" into the script), and make runs recipes with /bin/sh unless
## SHELL says otherwise
define write_tracked_command_script
printf '%b\n' "$(subst $$,\$$,$(subst ",\",$(subst \,\\\\,$(2)))) \nif [[ \"\$$?\" -eq \"0\" ]] ; then \n\ttouch $(1)$(TRACKING_SUCCESS_SUFFIX) && exit 0\nelse\n\ttouch $(1)$(TRACKING_FAIL_SUFFIX) && exit 1\nfi\n" > $(1).command.bash
endef

## job submission with simple defaults
##   $(1): output prefix, $(2): command text
## writes the command script, then hands it to $(ACTIVE_JOB_MONITOR) with the
## NORMAL_QUEUE/NORMAL_TIME settings. the monitor is only invoked if the script
## was written successfully
define sub_handler
$(call write_tracked_command_script,$(1),$(2)) && \
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(NORMAL_TIME)" -q $(NORMAL_QUEUE) -c $(1).command.bash -t 10
endef

## job submission for a very long run
##   $(1): output prefix, $(2): command text
## same as sub_handler, but with the LONG_QUEUE/LONG_TIME settings
define sub_handler_long
$(call write_tracked_command_script,$(1),$(2)) && \
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(LONG_TIME)" -q $(LONG_QUEUE) -c $(1).command.bash -t 10
endef

## job submission expecting customizable parameters
##   $(1): output prefix, $(2): command text,
##   $(3): queue (monitor -q option), $(4): resource/time string (monitor -r option)
define sub_handler_specify_queue_time
$(call write_tracked_command_script,$(1),$(2)) && \
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(4)" -q $(3) -c $(1).command.bash -t 10
endef


## log handling without cluster submission
##   $(1): output prefix, $(2): command text
## writes the command script, removes any success/fail tracking files left
## under this prefix, then runs the script locally with bash, sending stdout
## to $(1).output and stderr to $(1).error. each step only runs if the
## previous one succeeded
define log_handler
$(call write_tracked_command_script,$(1),$(2)) && \
rm -f $(1)$(TRACKING_SUCCESS_SUFFIX) $(1)$(TRACKING_FAIL_SUFFIX) && \
bash $(1).command.bash > $(1).output 2> $(1).error
endef


## map a results path to a supercontinent code; simplified for now.
##   $(1): a path under $(RESULT_OUTPUT_DIR)
## $(RESULT_OUTPUT_DIR) is removed from the path, the remainder is split on
## "/", and the second component is taken as the GRAF-style harmonized
## ancestry. "European" yields EUR; anything else (including a missing second
## component) yields EAS
resolve_ancestry = $(if $(filter European,$(word 2,$(subst /, ,$(subst $(RESULT_OUTPUT_DIR),,$(1))))),EUR,EAS)

## utility python script to get extensions from yaml with pyyaml
## probably replace this with a prettier solution when you can
## in this file it is run as: <script> $(EXTENSION_CONFIG) <key> [<key> ...]
GET_SINGLE_YAML_ENTRY := $(SHARED_SOURCE)/get_single_yaml_entry.py

## frequency mode valid options
SUBJECT_MODE := subject
REFERENCE_MODE := reference

## id mode valid options
RSID_MODE := rsid
CHRPOS_MODE := chrpos

## standardized suffixes for tracking files, looked up from the yaml extension
## configuration $(EXTENSION_CONFIG) with $(GET_SINGLE_YAML_ENTRY).
## the lookups only run when python3 is present; otherwise every suffix is
## left empty. that is the situation in the `plco-analysis-ldsc` environment,
## which is ok for now because ldsc/Makefile and
## shared-makefiles/Makefile.ldscores don't use these suffixes

## "0" when `command -v python3` succeeds, empty otherwise
PYTHON3_PRESENT := $(filter 0,$(shell command -v python3 > /dev/null 2>&1 && echo $$?))

## look up one entry of the extension yaml. $(1) is the key path, handed to the
## lookup script as space-separated arguments. expands to nothing when python3
## is not present
lookup_tracker_suffix = $(if $(PYTHON3_PRESENT),$(shell $(GET_SINGLE_YAML_ENTRY) $(EXTENSION_CONFIG) $(1)),)

## top-level yaml entries
PHENOTYPE_DATASET_TRACKER_SUFFIX := $(call lookup_tracker_suffix,phenotype-dataset)
PHENOTYPE_NAME_TRACKER_SUFFIX := $(call lookup_tracker_suffix,phenotype)
COVARIATE_NAME_TRACKER_SUFFIX := $(call lookup_tracker_suffix,covariates)
## entries nested under general-extensions
TRANSFORMATION_TRACKER_SUFFIX := $(call lookup_tracker_suffix,general-extensions transformation suffix)
SEX_SPECIFIC_TRACKER_SUFFIX := $(call lookup_tracker_suffix,general-extensions sex-specific suffix)
CONTROL_INCLUSION_TRACKER_SUFFIX := $(call lookup_tracker_suffix,general-extensions control_inclusion suffix)
CONTROL_EXCLUSION_TRACKER_SUFFIX := $(call lookup_tracker_suffix,general-extensions control_exclusion suffix)
FINALIZATION_TRACKER_SUFFIX := $(call lookup_tracker_suffix,finalization)
FREQUENCY_MODE_TRACKER_SUFFIX := $(call lookup_tracker_suffix,general-extensions frequency_mode suffix)
ID_MODE_TRACKER_SUFFIX := $(call lookup_tracker_suffix,general-extensions id_mode suffix)
CATEGORY_TRACKER_SUFFIX := $(call lookup_tracker_suffix,categories)