forked from NCI-CGR/plco-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile.config
298 lines (260 loc) · 15.6 KB
/
Makefile.config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
## The following global variables are available to dependent pipelines
## and furthermore can be overridden during individual calls to make, e.g.:
## `make CONFIG_INPUT_DIR=/path/to/alternate/configs`
## Note that overriding variables can cause confusing behavior if you don't
## know what you're doing, so please be careful.
#### Variables you likely need to set for your own project
## installation directory for your project. can be manipulated if you know
## what you're doing
## both variables hold the same path by default. in this file, PROJECT_BASE_DIR
## anchors configuration, pipeline, and output locations, while PLCO_INSTALL_PREFIX
## anchors input data (chip freeze, external files, imputed data, phenotypes)
PROJECT_BASE_DIR := /CGF/GWAS/Scans/PLCO/builds/1/plco-analysis
PLCO_INSTALL_PREFIX := /CGF/GWAS/Scans/PLCO/builds/1/plco-analysis
## names of chips that should be processed for the study. these can be
## arbitrary names; however: underscores ("_") are used to determine batches
## for platforms split up into multiple imputations, so do not make up chip
## names with underscores
## the patsubst below expands to GSA_batch1 ... GSA_batch5, i.e. the one place
## an underscore is used here is as the batch separator
PLATFORMS := OmniX Omni25 Omni5 Oncoarray $(patsubst %,GSA_batch%,$(shell seq 1 5))
## location of chip freeze files; if you have access to the relevant spaces on cgems,
## you can use those; otherwise, set this to where you've placed your chip files from
## your DUPS request or similar other data download
## note this default lives *beside* the install prefix (../), not inside it
CHIP_FREEZE_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/../current-chip-final-subjects
## directory containing the cross-platform duplicate resolution file, by default named
## 'PLCO_final_subject_list_Ancestry_UniqGenotypePlatform_04132020.txt'
EXTERNAL_FILE_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/external-files
## full path (directory and filename) of the duplicates file
UNIQUE_SUBJECT_LIST := $(EXTERNAL_FILE_INPUT_DIR)/PLCO_final_subject_list_Ancestry_UniqGenotypePlatform_04132020.txt
## location of imputed data, after Rsq filtering and duplicate subject removal
## like the chip freeze, this default lives beside the install prefix (../)
FILTERED_IMPUTED_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/../freeze2-imputation/raw-mis-nonoverlapping-subjects
## location of configuration files for analysis models
CONFIG_INPUT_DIR := $(PROJECT_BASE_DIR)/config
## location of 1000 Genomes reference files. this is *optional*; there is a pipeline
## that will automatically download these files for you if you want that, but you can
## instead point to a local directory containing the 1KG files if you don't want
## to download a redundant copy
KG_REFERENCE_INPUT_DIR := $(PROJECT_BASE_DIR)/1KG_files
## 1000 Genomes reference data information for ldscores for use with BOLT
## if you want to use local files, make sure they match the format specified here
## (ALL.chr#.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz)
## i.e. a per-chromosome file is $(KG_GENOTYPES_PREFIX)<chr>$(KG_GENOTYPES_SUFFIX)
KG_GENOTYPES_PREFIX := $(KG_REFERENCE_INPUT_DIR)/ALL.chr
KG_GENOTYPES_SUFFIX := .shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz
## bare filename (no directory component) of the 1KG release manifest
## NOTE(review): presumably resolved against KG_DOWNLOAD_SITE and/or
## KG_REFERENCE_INPUT_DIR by the download pipeline -- confirm against its users
KG_MANIFEST := 20181203_biallelic_SNV_manifest.txt
## this variable is only required if you want the pipeline to download the 1KG data for you
KG_DOWNLOAD_SITE := ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV
## location of known signals for pretty plotting. this is *optional* and only
## used in shared-makefiles/Makefile.plotting
KNOWN_SIGNALS_INPUT_DIR := $(PROJECT_BASE_DIR)/known-signals
## location of phenotype data used for all analyses
## format is: plain text, tab-delimited, header row with variable names, single column with subject IDs
## note that this can be overridden (make PHENOTYPE_FILENAME=/path/to/different/file saige) to run
## analyses using custom phenotypes, skipping the standard atlas-style processing mechanism
PHENOTYPE_FILENAME := $(PLCO_INSTALL_PREFIX)/phenotypes/v10/atlas_v10.with_na.augmented.02nov2020.tsv
## header name in phenotype file for ID column
PHENOTYPE_ID_COLNAME := plco_id
#### Variables you can change but don't have to for successful run
## every directory in this section defaults to a subdirectory of $(PROJECT_BASE_DIR)
## pipeline and output location for bgen imputed data reformatting pipeline results
BGEN_OUTPUT_DIR := $(PROJECT_BASE_DIR)/bgen
## pipeline and output location for chip QC pipeline results
CLEANED_CHIP_OUTPUT_DIR := $(PROJECT_BASE_DIR)/cleaned-chips-by-ancestry
## pipeline and output location for relatedness pipeline results
RELATEDNESS_OUTPUT_DIR := $(PROJECT_BASE_DIR)/relatedness
## pipeline and output location for ancestry pipeline results
ANCESTRY_OUTPUT_DIR := $(PROJECT_BASE_DIR)/ancestry
## pipeline and output location for ldsc pipeline results
LDSC_OUTPUT_DIR := $(PROJECT_BASE_DIR)/ldsc
## output location for fastGWA GRM results files
## note the pipeline itself is under shared-makefiles
FASTGWA_GRM_OUTPUT_DIR := $(PROJECT_BASE_DIR)/fastgwa-grm
## output location for all primary and meta-analysis
## this is one of a few very likely override targets, if you
## want to run some analyses that you don't want to conflict with
## an existing analysis output path
## this value is also what resolve_ancestry (defined later in this file)
## removes from a results path before picking out the ancestry component
RESULT_OUTPUT_DIR := $(PROJECT_BASE_DIR)/results
## output location for globus results, for data distribution to CBIIT
## or potentially future collaborators
GLOBUS_OUTPUT_DIR := $(PROJECT_BASE_DIR)/globus
## pipeline location for analysis module pipelines, meta-analysis, plotting, ldscore regression, globus.
## basically, any pipeline that needs to run for each analysis config file
SHARED_MAKEFILES := $(PROJECT_BASE_DIR)/shared-makefiles
## utility script location. most of the former contents of this directory have been
## migrated out to separately maintained packages that are pulled in with conda
SHARED_SOURCE := $(PROJECT_BASE_DIR)/shared-source
## chromosomes to analyze
## generated with `seq 1 22`, so this is the integers 1 through 22
CHRS := $(shell seq 1 22)
#### Supported software variables; shouldn't need to be redefined if using the
#### conda environment specification provided with the pipeline
## bare command names below (plink, bgenix, metal, ...) are resolved through PATH.
## paths built from $(CONDA_PREFIX) rely on that variable being supplied from
## outside this file (it is not assigned anywhere in this section)
## NOTE(review): presumably CONDA_PREFIX comes from the activated conda
## environment; if it is empty these paths silently become /bin/... and /share/...
## default memory allocation for plink, in MB
PLINK_MEMORY_LIMIT := 16000
## plink v1.9 call
PLINK19 := plink --memory $(PLINK_MEMORY_LIMIT)
## plink v2 call
PLINK2 := plink2 --memory $(PLINK_MEMORY_LIMIT)
## UCSC liftOver call
LIFTOVER_EXECUTABLE := liftOver
## path to and location of hg19->hg38 liftOver chain for chip conversion
LIFTOVER_19_TO_38 := $(CONDA_PREFIX)/share/liftover-chains/hg19ToHg38.over.chain.gz
## bgenix call
BGENIX := bgenix
## bolt-lmm call
BOLTLMM_EXECUTABLE := bolt
## hg38 genetic map, provided by bolt under tables/
GRCH38_RECOMBINATION_MAP := $(CONDA_PREFIX)/share/bolt-lmm/tables/genetic_map_hg38_withX.txt.gz
## eigensoft smartpca call
SMARTPCA := smartpca
## number of principal components to be generated by smartpca
## however many you set here, this is the number (as PC1 - PCN)
## automatically available to specify as analysis config covariates
SMARTPCA_N_PCS := 20
## ldsc python scripts: the main ldsc entry point and its summary statistics
## munging companion script
LDSC_PY := ldsc.py
MUNGE_SUMSTATS_PY := munge_sumstats.py
## internal script for annotating results files with reference frequencies
ANNOTATE_FREQUENCY := $(CONDA_PREFIX)/bin/annotate_frequency.out
## reference frequency files, by supercontinent
## file of id chr pos a0 a1
TOPMED_FREQUENCY_METADATA := $(PROJECT_BASE_DIR)/annotations/freq.prefix.tsv.bz2
## path and prefix of frequency data by supercontinent
## NOTE(review): presumably combined as PREFIX + supercontinent code + SUFFIX,
## e.g. freq.EUR.tsv.bz2 using the codes emitted by resolve_ancestry -- confirm
TOPMED_FREQUENCY_FILE_PREFIX := $(PROJECT_BASE_DIR)/annotations/freq.
## suffix of frequency data by supercontinent
TOPMED_FREQUENCY_FILE_SUFFIX := .tsv.bz2
## R script for deploying R::construct.model.matrix in analysis pipelines
CONSTRUCT_MODEL_MATRIX := $(SHARED_SOURCE)/construct_model_matrix.R
## internal script for annotating results files with rsIDs (instead of chr:pos IDs from TOPMed)
ANNOTATE_RSID := $(CONDA_PREFIX)/bin/annotate_rsids_from_linker.out
## linker file for chr:pos:ref:alt IDs to rsIDs
RSID_LINKER_FILE := $(PROJECT_BASE_DIR)/annotations/rsid_linker.tsv.bz2
## internal script for merging the results of split saige runs for categorical (N>2) variables
COMBINE_CATEGORICAL_RUNS := $(CONDA_PREFIX)/bin/combine_categorical_runs.out
## R script for launching gwasqcplots manhattan/qq plotter
MANHATTAN_LAUNCHER := $(SHARED_SOURCE)/manhattan_loci_colored_launcher.R
## utility python function for dealing with output paths in globus pipeline
FORMAT_GLOBUS_PREFIXES := $(SHARED_SOURCE)/format_globus_prefixes.py
## internal script for creating association results directories and providing their prefixes to the make DAG
INITIALIZE_OUTPUT_DIRECTORIES := $(CONDA_PREFIX)/bin/initialize_output_directories.out
## yaml configuration for tracking file extensions; processed into Make variable space at end of this file
## (see the *_TRACKER_SUFFIX assignments, which query this file via GET_SINGLE_YAML_ENTRY)
EXTENSION_CONFIG := $(CONDA_PREFIX)/share/initialize_output_directories/extensions.config.yaml
## simple R script for manipulating categorical data
PARSE_CATEGORICAL_OUTPUT := $(SHARED_SOURCE)/parse_categorical_output.R
## deal with data source for bolt ldscores files
RESOLVE_BOLT_LDSCORE := $(SHARED_SOURCE)/resolve_bolt_ldscore.bash
## R script for deploying the first saige round, relatedness estimation
SAIGE_ROUND_1_SCRIPT := $(SHARED_SOURCE)/saige_round1.R
## saige cutoff parameter for degree of relatedness
SAIGE_ROUND_1_RELATEDNESS_CUTOFF := 0.05
## R script for deploying the second saige round, association
SAIGE_ROUND_2_SCRIPT := $(SHARED_SOURCE)/saige_round2.R
## graf call (relatedness)
GRAF_EXECUTABLE := graf
## graf PlotPopulations perl call
## note that the variant of graf in conda has a patched shebang, such that it can
## be called as an executable correctly
GRAF_POP := PlotPopulations.pl
## location of graf internal reference bim file, for rsID mapping
GRAF_1KG_VARIANT_BIMFILE := $(CONDA_PREFIX)/share/graf/G1000FpGeno.bim
## sanitized graf ancestry groups, with " " values replaced by underscores
## (kept whitespace-free so each ancestry is a single word in this make list)
CLEANED_ANCESTRY_NAMES := European East_Asian Other South_Asian African_American Hispanic1 Hispanic2
## metal meta-analysis call
METAL_EXECUTABLE := metal
## internal script for merging files into CBIIT-approved globus distribution format
MERGE_FILES_FOR_GLOBUS := $(CONDA_PREFIX)/bin/merge_files_for_globus.out
## accepted analysis tools, which should have shared-makefiles/Makefile.NAME available
## (i.e. Makefile.saige, Makefile.boltlmm, Makefile.fastgwa)
SUPPORTED_METHODS := saige boltlmm fastgwa
## software subject count restrictions; extremely empirical and subject to change
## one threshold per entry of SUPPORTED_METHODS
## NOTE(review): presumably an analysis with fewer subjects than its method's
## threshold is skipped -- confirm the exact comparison in the consuming pipelines
FASTGWA_MINIMUM_VALID_SUBJECT_COUNT := 2000
BOLTLMM_MINIMUM_VALID_SUBJECT_COUNT := 3000
SAIGE_MINIMUM_VALID_SUBJECT_COUNT := 1001
## for fastGWA, a number of threads to use during run
N_THREADS := 2
## for case/control comparisons (saige), minimum number of available cases
## in a given platform comparison for the analysis to be run at all
MINIMUM_VALID_CASE_COUNT := 30
## chip cleaning parameters
## plink heterozygosity F statistic max value (absolute)
HET_F_MAX := 0.2
## plink IBS/IBD pi_hat minimum value for reporting
PIHAT_MIN := 0.05
## minimum number of samples in a chip/ancestry combination permitted
SAMPLE_SIZE_MIN := 50
## cutoff number for switching between oneshot and parallel runs of plink --genome
SAMPLE_SIZE_LARGE := 10000
## when running plink --genome --parallel, how many jobs to parallelize into
## may need to be increased down the line
GENOME_GZ_JOB_SPLIT := 20
## LDSCORES estimation parameters, for ldsc/bolt backend
## minor allele frequency threshold
LDSC_MAF_THRESHOLD := 0.005
#### Make infrastructure. Should only be edited by an expert.
## define tracking success file suffix
## the job handlers in this file (sub_handler*, log_handler) touch
## <output prefix>$(TRACKING_SUCCESS_SUFFIX) when the wrapped command exits 0
TRACKING_SUCCESS_SUFFIX := .success
## define tracking fail file suffix
## touched as <output prefix>$(TRACKING_FAIL_SUFFIX) on any nonzero exit instead
TRACKING_FAIL_SUFFIX := .fail
## cluster queue settings
## these are by default set to things that work for cgems/ccad/sge
## if you're on a different system, you'll have to update these
## as well as the handler itself.
## internal script for compatibility between make and qsub/sge
QSUB_JOB_MONITOR := $(CONDA_PREFIX)/bin/qsub_job_monitor.out
## internal script for compatibility between make and sbatch/slurm
SBATCH_JOB_MONITOR := $(CONDA_PREFIX)/bin/sbatch_job_monitor.out
## theoretically there should eventually be more interface programs for other clusters;
## set which one is used here
## this is the program the sub_handler* macros actually invoke
ACTIVE_JOB_MONITOR := $(QSUB_JOB_MONITOR)
## queue and resource default settings
## note for sbatch you'll want to add memory and cpu settings most likely?
## *_QUEUE values are passed to the job monitor with -q, *_TIME values with -r.
## the commented-out `norm` assignments are alternates kept for reference
## NOTE(review): presumably `norm` is the queue/partition name to use together
## with SBATCH_JOB_MONITOR -- confirm
## NORMAL_QUEUE / NORMAL_TIME: used by sub_handler
NORMAL_QUEUE := all.q
## NORMAL_QUEUE := norm
NORMAL_TIME := h_rt=23:45:00
## LONG_QUEUE / LONG_TIME: used by sub_handler_long
LONG_QUEUE := long.q
## LONG_QUEUE := norm
LONG_TIME := h_rt=71:45:00
## HUGE_QUEUE, SHORT_TIME and HOUR_TIME are not referenced by any handler in this
## file; they are available to pass as the queue/time arguments of
## sub_handler_specify_queue_time
HUGE_QUEUE := bigmem.q
## HUGE_QUEUE := norm
SHORT_TIME := h_rt=4:30:00
HOUR_TIME := h_rt=1:00:00
## job handlers. each is meant to be expanded with $(call ...) from a recipe:
##   $(1): output prefix. the generated script is written to $(1).command.bash,
##         and completion is recorded by touching $(1)$(TRACKING_SUCCESS_SUFFIX)
##         or $(1)$(TRACKING_FAIL_SUFFIX)
##   $(2): the shell command to run
##
## shared helper for all handlers below: emit the shell command that writes
## $(1).command.bash. the script runs $(2), then touches the success tracker and
## exits 0 if $(2) exited 0, otherwise touches the fail tracker and exits 1.
## the nested subst calls quadruple backslashes and escape double quotes and
## dollar signs in $(2), so the command text survives both the surrounding
## double quotes and `echo -e` escape processing.
## NOTE(review): `echo -e` needs a recipe shell whose echo understands -e (e.g.
## bash); SHELL is not set in this file -- confirm it is bash where this runs.
define write_tracked_command_script
echo -e "$(subst $$,\$$,$(subst ",\",$(subst \,\\\\,$(2)))) \nif [[ \"\$$?\" -eq \"0\" ]] ; then \n\ttouch $(1)$(TRACKING_SUCCESS_SUFFIX) && exit 0\nelse\n\ttouch $(1)$(TRACKING_FAIL_SUFFIX) && exit 1\nfi\n" > $(1).command.bash
endef
## job submission with simple defaults
## hands the script to $(ACTIVE_JOB_MONITOR) with NORMAL_TIME / NORMAL_QUEUE
define sub_handler
$(call write_tracked_command_script,$(1),$(2)) ; \
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(NORMAL_TIME)" -q $(NORMAL_QUEUE) -c $(1).command.bash -t 10
endef
## job submission for a very long run
## same as sub_handler, but with LONG_TIME / LONG_QUEUE
define sub_handler_long
$(call write_tracked_command_script,$(1),$(2)) ; \
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(LONG_TIME)" -q $(LONG_QUEUE) -c $(1).command.bash -t 10
endef
## job submission expecting customizable parameters
##   $(3): queue name, passed to the job monitor with -q
##   $(4): resource/time request, passed to the job monitor with -r
define sub_handler_specify_queue_time
$(call write_tracked_command_script,$(1),$(2)) ; \
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(4)" -q $(3) -c $(1).command.bash -t 10
endef
## log handling without cluster submission
## removes any stale tracker files for this prefix, then runs the script with
## bash directly, capturing stdout in $(1).output and stderr in $(1).error
define log_handler
$(call write_tracked_command_script,$(1),$(2)) ; \
rm -f $(1)$(TRACKING_SUCCESS_SUFFIX) $(1)$(TRACKING_FAIL_SUFFIX) ; \
bash $(1).command.bash > $(1).output 2> $(1).error
endef
## map a results path onto a supercontinent code; simplified for now
##   $(1): a path under $(RESULT_OUTPUT_DIR)
## helper: drop $(RESULT_OUTPUT_DIR) from $(1), split what remains on "/",
## and take the second component, which is treated as the GRAF-style
## harmonized ancestry name
ancestry_component_of_result_path = $(word 2,$(subst /, ,$(subst $(RESULT_OUTPUT_DIR),,$(1))))
## yields EUR when that component is exactly European; every other value
## (including a missing component) yields EAS
resolve_ancestry = $(if $(filter European,$(call ancestry_component_of_result_path,$(1))),EUR,EAS)
## python (pyyaml) utility script used to pull extensions out of a yaml file
## a prettier solution should probably replace this when possible
GET_SINGLE_YAML_ENTRY := $(SHARED_SOURCE)/get_single_yaml_entry.py
## valid options for frequency mode
SUBJECT_MODE := subject
REFERENCE_MODE := reference
## valid options for id mode
RSID_MODE := rsid
CHRPOS_MODE := chrpos
## standardized suffixes for tracking files
## the lookups below are only attempted when python3 is found on PATH;
## otherwise every suffix is left empty, which is the case in the
## `plco-analysis-ldsc` environment. ldsc/Makefile and
## shared-makefiles/Makefile.ldscores don't use them, so that's ok for now
## PYTHON3_PRESENT is "0" when `command -v python3` succeeds, empty otherwise
PYTHON3_PRESENT := $(filter 0,$(shell command -v python3 > /dev/null 2>&1 && echo $$?))
## $(call tracker_suffix_from_yaml,KEY [SUBKEY ...])
## runs $(GET_SINGLE_YAML_ENTRY) on $(EXTENSION_CONFIG) with the given key path
## and expands to its output, or to nothing when python3 is unavailable
tracker_suffix_from_yaml = $(if $(PYTHON3_PRESENT),$(shell $(GET_SINGLE_YAML_ENTRY) $(EXTENSION_CONFIG) $(1)),)
## top-level yaml entries
PHENOTYPE_DATASET_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,phenotype-dataset)
PHENOTYPE_NAME_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,phenotype)
COVARIATE_NAME_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,covariates)
## entries nested as general-extensions -> NAME -> suffix
TRANSFORMATION_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,general-extensions transformation suffix)
SEX_SPECIFIC_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,general-extensions sex-specific suffix)
CONTROL_INCLUSION_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,general-extensions control_inclusion suffix)
CONTROL_EXCLUSION_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,general-extensions control_exclusion suffix)
FINALIZATION_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,finalization)
FREQUENCY_MODE_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,general-extensions frequency_mode suffix)
ID_MODE_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,general-extensions id_mode suffix)
CATEGORY_TRACKER_SUFFIX := $(call tracker_suffix_from_yaml,categories)