Skip to content

Commit

Permalink
Merge pull request NCI-CGR#22 from NCI-CGR/development
Browse files Browse the repository at this point in the history
merge dev changes
  • Loading branch information
lightning-auriga authored Feb 2, 2021
2 parents 422e421 + 90c5696 commit b7f4c5f
Show file tree
Hide file tree
Showing 35 changed files with 469 additions and 203 deletions.
72 changes: 49 additions & 23 deletions Makefile.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
## installation directory for your project. can be manipulated if you know
## what you're doing
PROJECT_BASE_DIR := /CGF/GWAS/Scans/PLCO/builds/1/plco-analysis
PLCO_INSTALL_PREFIX := /CGF/GWAS/Scans/PLCO/builds/1/plco-analysis
## names of chips that should be processed for the study. these can be
## arbitrary names; however: underscores ("_") are used to determine batches
## for platforms split up into multiple imputations, so do not make up chip
Expand All @@ -18,14 +19,14 @@ PLATFORMS := OmniX Omni25 Omni5 Oncoarray $(patsubst %,GSA_batch%,$(shell seq 1
## location of chip freeze files; if you have access to the relevant spaces on cgens,
## you can use those; otherwise, set this to where you've placed your chip files from
## your DUPS request or similar other data download
CHIP_FREEZE_INPUT_DIR := $(PROJECT_BASE_DIR)/../current-chip-final-subjects
CHIP_FREEZE_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/../current-chip-final-subjects
## location of cross-platform duplicate resolution file, by default named
## 'PLCO_final_subject_list_Ancestry_UniqGenotypePlatform_04132020.txt'
EXTERNAL_FILE_INPUT_DIR := $(PROJECT_BASE_DIR)/external-files
EXTERNAL_FILE_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/external-files
## location and name of duplicates file
UNIQUE_SUBJECT_LIST := $(EXTERNAL_FILE_INPUT_DIR)/PLCO_final_subject_list_Ancestry_UniqGenotypePlatform_04132020.txt
## location of imputed data, after Rsq filtering and duplicate subject removal
FILTERED_IMPUTED_INPUT_DIR := $(PROJECT_BASE_DIR)/../freeze2-imputation/raw-mis-nonoverlapping-subjects
FILTERED_IMPUTED_INPUT_DIR := $(PLCO_INSTALL_PREFIX)/../freeze2-imputation/raw-mis-nonoverlapping-subjects
## location of configuration files for analysis models
CONFIG_INPUT_DIR := $(PROJECT_BASE_DIR)/config
## location of 1000 Genomes reference files. this is *optional*; there is a pipeline
Expand All @@ -48,7 +49,7 @@ KNOWN_SIGNALS_INPUT_DIR := $(PROJECT_BASE_DIR)/known-signals
## format is: plain text, tab-delimited, header row with variable names, single column with subject IDs
## note that this can be overridden (make PHENOTYPE_FILENAME=/path/to/different/file saige) to run
## analyses using custom phenotypes, skipping the standard atlas-style processing mechanism
PHENOTYPE_FILENAME := $(PROJECT_BASE_DIR)/phenotypes/v10/atlas_v10.with_na.augmented.02nov2020.tsv
PHENOTYPE_FILENAME := $(PLCO_INSTALL_PREFIX)/phenotypes/v10/atlas_v10.with_na.augmented.02nov2020.tsv
## header name in phenotype file for ID column
PHENOTYPE_ID_COLNAME := plco_id

Expand Down Expand Up @@ -116,7 +117,7 @@ SMARTPCA := smartpca
SMARTPCA_N_PCS := 20
## ldsc python scripts
LDSC_PY := ldsc.py
MUNGE_SUMSTATS_PI := munge_sumstats.py
MUNGE_SUMSTATS_PY := munge_sumstats.py
## internal script for annotating results files with reference frequencies
ANNOTATE_FREQUENCY := $(CONDA_PREFIX)/bin/annotate_frequency.out
## reference frequency files, by supercontinent
Expand Down Expand Up @@ -166,11 +167,6 @@ CLEANED_ANCESTRY_NAMES := European East_Asian Other South_Asian African_American
METAL_EXECUTABLE := metal
## internal script for merging files into CBIIT-approved globus distribution format
MERGE_FILES_FOR_GLOBUS := $(CONDA_PREFIX)/bin/merge_files_for_globus.out
## internal script for compatibility between make and qsub/sge
QSUB_JOB_MONITOR := $(CONDA_PREFIX)/bin/qsub_job_monitor.out
## theoretically there should eventually be more interface programs for other clusters;
## set which one is used here
ACTIVE_JOB_MONITOR := $(QSUB_JOB_MONITOR)
## accepted analysis tools, which should have shared-makefiles/Makefile.NAME available
SUPPORTED_METHODS := saige boltlmm fastgwa

Expand Down Expand Up @@ -211,31 +207,62 @@ TRACKING_SUCCESS_SUFFIX := .success
## define tracking fail file suffix
TRACKING_FAIL_SUFFIX := .fail

## log handling without qsub submission
## runs a command locally through bash instead of handing it to a job monitor.
##   $(1): output prefix; the wrapper script is written to $(1).command.bash,
##         its stdout is captured in $(1).output and its stderr in $(1).error
##   $(2): command text to run; backslashes, double quotes and dollar signs in it
##         are escaped so the text survives being echoed into the wrapper script
## the wrapper touches $(1)$(TRACKING_SUCCESS_SUFFIX) and exits 0 when the command
## exits 0; otherwise it touches $(1)$(TRACKING_FAIL_SUFFIX) and exits 1.
## any existing success/fail tracker for $(1) is removed before the wrapper runs
define log_handler
echo -e "$(subst $$,\$$,$(subst ",\",$(subst \,\\\\,$(2)))) \nif [[ \"\$$?\" -eq \"0\" ]] ; then \n\ttouch $(1)$(TRACKING_SUCCESS_SUFFIX) && exit 0\nelse\n\ttouch $(1)$(TRACKING_FAIL_SUFFIX) && exit 1\nfi\n" > $(1).command.bash ; \
rm -f $(1)$(TRACKING_SUCCESS_SUFFIX) $(1)$(TRACKING_FAIL_SUFFIX) ; \
bash $(1).command.bash > $(1).output 2> $(1).error
endef

## cluster queue settings
## these are by default set to things that work for cgems/ccad/sge
## if you're on a different system, you'll have to update these
## as well as the handler itself.

## internal script for compatibility between make and qsub/sge
QSUB_JOB_MONITOR := $(CONDA_PREFIX)/bin/qsub_job_monitor.out
## internal script for compatibility between make and sbatch/slurm
SBATCH_JOB_MONITOR := $(CONDA_PREFIX)/bin/sbatch_job_monitor.out
## theoretically there should eventually be more interface programs for other clusters;
## set which one is used here
ACTIVE_JOB_MONITOR := $(QSUB_JOB_MONITOR)

## queue and resource default settings
## note: for sbatch you will most likely also want to add memory and cpu settings.
## each *_QUEUE value is a queue name and each *_TIME value is a resource request
## string of the form h_rt=HH:MM:SS.
## the commented-out "norm" lines are alternative queue names -- presumably the
## values to use with sbatch/slurm; TODO confirm before enabling them
NORMAL_QUEUE := all.q
## NORMAL_QUEUE := norm
NORMAL_TIME := h_rt=23:45:00
LONG_QUEUE := long.q
## LONG_QUEUE := norm
LONG_TIME := h_rt=71:45:00
HUGE_QUEUE := bigmem.q
## HUGE_QUEUE := norm
SHORT_TIME := h_rt=4:30:00
HOUR_TIME := h_rt=1:00:00


## job submission with simple defaults
##   $(1): output prefix; the wrapper script is written to $(1).command.bash and
##         $(1) is also handed to the job monitor as its -o argument
##   $(2): command text to run; backslashes, double quotes and dollar signs in it
##         are escaped so the text survives being echoed into the wrapper script
## the wrapper touches $(1)$(TRACKING_SUCCESS_SUFFIX) and exits 0 when the command
## exits 0; otherwise it touches $(1)$(TRACKING_FAIL_SUFFIX) and exits 1.
## the wrapper is submitted through $(ACTIVE_JOB_MONITOR); the queue (-q) and
## resource request (-r) are not supplied by the caller
define qsub_handler
define sub_handler
echo -e "$(subst $$,\$$,$(subst ",\",$(subst \,\\\\,$(2)))) \nif [[ \"\$$?\" -eq \"0\" ]] ; then \n\ttouch $(1)$(TRACKING_SUCCESS_SUFFIX) && exit 0\nelse\n\ttouch $(1)$(TRACKING_FAIL_SUFFIX) && exit 1\nfi\n" > $(1).command.bash ; \
$(ACTIVE_JOB_MONITOR) -o $(1) -r h_rt=23:10:00 -q all.q -c $(1).command.bash -t 10
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(NORMAL_TIME)" -q $(NORMAL_QUEUE) -c $(1).command.bash -t 10
endef

## job submission for a very long run
##   $(1): output prefix; the wrapper script is written to $(1).command.bash and
##         $(1) is also handed to the job monitor as its -o argument
##   $(2): command text to run; escaped the same way as in the other handlers
## the wrapper touches $(1)$(TRACKING_SUCCESS_SUFFIX) or $(1)$(TRACKING_FAIL_SUFFIX)
## depending on whether the command exits 0.
## the wrapper is submitted through $(ACTIVE_JOB_MONITOR) with the long-run queue (-q)
## and resource request (-r); neither is supplied by the caller
define qsub_handler_long
define sub_handler_long
echo -e "$(subst $$,\$$,$(subst ",\",$(subst \,\\\\,$(2)))) \nif [[ \"\$$?\" -eq \"0\" ]] ; then \n\ttouch $(1)$(TRACKING_SUCCESS_SUFFIX) && exit 0\nelse\n\ttouch $(1)$(TRACKING_FAIL_SUFFIX) && exit 1\nfi\n" > $(1).command.bash ; \
$(ACTIVE_JOB_MONITOR) -o $(1) -r h_rt=71:00:00 -q long.q -c $(1).command.bash -t 10
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(LONG_TIME)" -q $(LONG_QUEUE) -c $(1).command.bash -t 10
endef

## job submission expecting customizable parameters
##   $(1): output prefix; the wrapper script is written to $(1).command.bash and
##         $(1) is also handed to the job monitor as its -o argument
##   $(2): command text to run; escaped the same way as in the other handlers
##   $(3): queue name, passed to the job monitor as -q
##   $(4): resource request string, passed (double-quoted) to the job monitor as -r
## the wrapper touches $(1)$(TRACKING_SUCCESS_SUFFIX) or $(1)$(TRACKING_FAIL_SUFFIX)
## depending on whether the command exits 0
define qsub_handler_specify_queue_time
define sub_handler_specify_queue_time
echo -e "$(subst $$,\$$,$(subst ",\",$(subst \,\\\\,$(2)))) \nif [[ \"\$$?\" -eq \"0\" ]] ; then \n\ttouch $(1)$(TRACKING_SUCCESS_SUFFIX) && exit 0\nelse\n\ttouch $(1)$(TRACKING_FAIL_SUFFIX) && exit 1\nfi\n" > $(1).command.bash ; \
$(ACTIVE_JOB_MONITOR) -o $(1) -r "$(4)" -q $(3) -c $(1).command.bash -t 10
endef



## log handling without cluster submission
##   $(1): output prefix; the wrapper script is written to $(1).command.bash,
##         its stdout is captured in $(1).output and its stderr in $(1).error
##   $(2): command text to run; backslashes, double quotes and dollar signs in it
##         are escaped so the text survives being echoed into the wrapper script
## the wrapper touches $(1)$(TRACKING_SUCCESS_SUFFIX) and exits 0 when the command
## exits 0; otherwise it touches $(1)$(TRACKING_FAIL_SUFFIX) and exits 1.
## any existing success/fail tracker for $(1) is removed before the wrapper is run
## locally with bash
define log_handler
echo -e "$(subst $$,\$$,$(subst ",\",$(subst \,\\\\,$(2)))) \nif [[ \"\$$?\" -eq \"0\" ]] ; then \n\ttouch $(1)$(TRACKING_SUCCESS_SUFFIX) && exit 0\nelse\n\ttouch $(1)$(TRACKING_FAIL_SUFFIX) && exit 1\nfi\n" > $(1).command.bash ; \
$(ACTIVE_JOB_MONITOR) -o $(1) -r h_rt=$(if $(4),$(4),23:00:00) -q $(if $(3),$(3),all.q) -c $(1).command.bash -t 10
rm -f $(1)$(TRACKING_SUCCESS_SUFFIX) $(1)$(TRACKING_FAIL_SUFFIX) ; \
bash $(1).command.bash > $(1).output 2> $(1).error
endef


## convert GRAF-style harmonized ancestries to supercontinent; simplified for now
define resolve_ancestry
$(if $(filter European,$(word 2,$(subst /, ,$(subst $(RESULT_OUTPUT_DIR),,$(1))))),EUR,EAS)
Expand Down Expand Up @@ -269,4 +296,3 @@ FINALIZATION_TRACKER_SUFFIX := $(if $(PYTHON3_PRESENT),$(shell $(GET_SINGLE_YAML
FREQUENCY_MODE_TRACKER_SUFFIX := $(if $(PYTHON3_PRESENT),$(shell $(GET_SINGLE_YAML_ENTRY) $(EXTENSION_CONFIG) general-extensions frequency_mode suffix),)
ID_MODE_TRACKER_SUFFIX := $(if $(PYTHON3_PRESENT),$(shell $(GET_SINGLE_YAML_ENTRY) $(EXTENSION_CONFIG) general-extensions id_mode suffix),)
CATEGORY_TRACKER_SUFFIX := $(if $(PYTHON3_PRESENT),$(shell $(GET_SINGLE_YAML_ENTRY) $(EXTENSION_CONFIG) categories),)

12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ that can be modularly added, either with Make pipelines or tools in other langua

### Installation Instructions

See installation instructions [here](https://plco-analysis.readthedocs.io/en/latest/Installation.html)
See installation instructions [on readthedocs](https://plco-analysis.readthedocs.io/en/latest/Installation.html)

### Development Schedule
##### v1.0 (approximately corresponding to PLCO Atlas tranche 1 release)
##### v2.0 (platform-independent build for PLCO Atlas tranche 2)
- [x] BOLT-LMM support
- [x] fastGWA support
- [x] SAIGE support (binary traits)
Expand All @@ -40,21 +40,21 @@ See installation instructions [here](https://plco-analysis.readthedocs.io/en/lat
- [x] heuristic testing to support above
- [x] hunt down last untracked auxiliary files
- [x] complete (straightforward and documented) platform independence with conda
- [ ] documentation: R-style vignette for generalized usage
- [x] documentation: R-style vignette for generalized usage
- [x] this README

##### v2.0 (approximately corresponding with the end of PLCO Atlas)
##### v3.0 (approximately corresponding with the end of PLCO Atlas)
- [ ] polmm/ordinal phenotype support
- [ ] top-level parameter exposure for analysis tools
- [ ] slurm support
- [ ] validated slurm support
- [ ] scalable testing with per-test dependency specification
- [ ] force post-primary analysis tools to ignore analysis results absent from config
- [ ] heuristic testing to support above
- [ ] documentation: full installation for multiple platforms, clusters; possibly docker
- [ ] documentation: doxygen support
- [ ] this README

##### v3.0 (the Confluence build)
##### v4.0 (the Confluence build)
- [ ] config-level parameter exposure for analysis tools
- [ ] integration of external meta-analysis files
- [ ] distributed meta-analysis best practice QC measures
Expand Down
4 changes: 2 additions & 2 deletions ancestry/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,15 @@ Oncoarray.graf_estimates.modified.txt$(TRACKING_SUCCESS_SUFFIX): $$(subst .modif
## output: {CHIP}.graf_estimates.raw.txt$(TRACKING_SUCCESS_SUFFIX)
## input: {CHIP}.graf_pop$(TRACKING_SUCCESS_SUFFIX)
%.graf_estimates.raw.txt$(TRACKING_SUCCESS_SUFFIX): %.graf_pop$(TRACKING_SUCCESS_SUFFIX)
-$(call qsub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(GRAF_POP) $(subst $(TRACKING_SUCCESS_SUFFIX),,$<) $(subst $(TRACKING_SUCCESS_SUFFIX),,$@))
-$(call sub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(GRAF_POP) $(subst $(TRACKING_SUCCESS_SUFFIX),,$<) $(subst $(TRACKING_SUCCESS_SUFFIX),,$@))
rm -f $(subst $(TRACKING_SUCCESS_SUFFIX),$(TRACKING_FAIL_SUFFIX),$@) && touch $@

## patterns:
## output: {CHIP}.graf_pop$(TRACKING_SUCCESS_SUFFIX)
## input: {PROJECT_BASE_DIR}/relatedness/{CHIP}.fpg
## Notes: input is the output of the relatedness pipeline, which needs to complete before this pipeline is run.
%.graf_pop$(TRACKING_SUCCESS_SUFFIX): $(REL_DIR)/%.fpg
-$(call qsub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(GRAF_EXECUTABLE) -geno $< -pop $(subst $(TRACKING_SUCCESS_SUFFIX),,$@))
-$(call sub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(GRAF_EXECUTABLE) -geno $< -pop $(subst $(TRACKING_SUCCESS_SUFFIX),,$@))
rm -f $(subst $(TRACKING_SUCCESS_SUFFIX),$(TRACKING_FAIL_SUFFIX),$@) && touch $@

## test set to run after completion
Expand Down
6 changes: 3 additions & 3 deletions bgen/Makefile.bgen_format
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ $(filter %-noNAs.sample$(TRACKING_SUCCESS_SUFFIX),$(ALL_TARGETS)): $$(subst -noN
## input: {ANCESTRY}/
## Notes: read pgen files with haplotype dosages into plink, emit bgen 1.2 dosages.
$(filter %.bgen$(TRACKING_SUCCESS_SUFFIX),$(ALL_TARGETS)): $$(subst .bgen$(TRACKING_SUCCESS_SUFFIX),.pgen$(TRACKING_SUCCESS_SUFFIX),$$@) $(firstword $(subst /, ,$(PROJECT_CODE))).unique.prioritized.keep$(TRACKING_SUCCESS_SUFFIX) | $$(dir $$@)
$(call qsub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(PLINK2) --pfile $(subst .pgen$(TRACKING_SUCCESS_SUFFIX),,$<) --keep $(subst $(TRACKING_SUCCESS_SUFFIX),,$(word 2,$^)) --recode bgen-1.2 bits=8 --out $(subst .bgen$(TRACKING_SUCCESS_SUFFIX),,$@))
$(call sub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(PLINK2) --pfile $(subst .pgen$(TRACKING_SUCCESS_SUFFIX),,$<) --keep $(subst $(TRACKING_SUCCESS_SUFFIX),,$(word 2,$^)) --recode bgen-1.2 bits=8 --out $(subst .bgen$(TRACKING_SUCCESS_SUFFIX),,$@))

## patterns:
## output: {ANCESTRY}/chr{CHR}-filtered.pgen$(TRACKING_SUCCESS_SUFFIX)
Expand All @@ -55,14 +55,14 @@ $(filter %.bgen$(TRACKING_SUCCESS_SUFFIX),$(ALL_TARGETS)): $$(subst .bgen$(TRACK
## this has already been handled upstream, but note that this step would remove genotype probabilities. also note that this has to be a separate
## step from the bgen write step because erase-phase is apparently specific to --make-pgen
$(subst .bgen$(TRACKING_SUCCESS_SUFFIX),.pgen$(TRACKING_SUCCESS_SUFFIX),$(filter %.bgen$(TRACKING_SUCCESS_SUFFIX),$(ALL_TARGETS))): $$(FILTERED_IMPUTED_DIR)/$$(PROJECT_CODE)/$$(subst .pgen$(TRACKING_SUCCESS_SUFFIX),.dose.vcf.gz,$$@) | $$(dir $$@)
$(call qsub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(PLINK2) --vcf $< dosage=HDS --id-delim _ --make-pgen erase-phase --out $(subst .pgen$(TRACKING_SUCCESS_SUFFIX),,$@))
$(call sub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(PLINK2) --vcf $< dosage=HDS --id-delim _ --make-pgen erase-phase --out $(subst .pgen$(TRACKING_SUCCESS_SUFFIX),,$@))

## patterns:
## output: {ANCESTRY}/chr{CHR}-filtered.bgen.bgi$(TRACKING_SUCCESS_SUFFIX)
## input: {ANCESTRY}/chr{CHR}-filtered.bgen$(TRACKING_SUCCESS_SUFFIX)
## Notes: use bgenix to index the bgen file for use with downstream applications. -clobber required for pipeline reruns.
%.bgi$(TRACKING_SUCCESS_SUFFIX): %$(TRACKING_SUCCESS_SUFFIX)
$(call qsub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(BGENIX) -g $(subst $(TRACKING_SUCCESS_SUFFIX),,$<) -index -clobber)
$(call sub_handler,$(subst $(TRACKING_SUCCESS_SUFFIX),,$@),$(BGENIX) -g $(subst $(TRACKING_SUCCESS_SUFFIX),,$<) -index -clobber)

## patterns:
## output: {ANCESTRY}/
Expand Down
Loading

0 comments on commit b7f4c5f

Please sign in to comment.