Skip to content

Commit

Permalink
Merge pull request #36 from eastgenomics/DI-366-v2.1.0
Browse files Browse the repository at this point in the history
Di 366 v2.1.0 (#36)

Co-Authored-By: Jay Miles <jay.miles@addenbrookes.nhs.uk>
  • Loading branch information
Addy81 and Jay-Miles authored Sep 4, 2023
2 parents 3ef7992 + e791df1 commit eb08b78
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 78 deletions.
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,26 @@
# egg4_dias_TWE_config
Config for the Dias TWE assay

This repo contains a Python config file which is used with dias_batch_running to specify inputs for running the Dias pipeline for TWE (Twist Whole Exome) data.

## What does the config do?
dias_batch_running ([https://github.com/eastgenomics/dias_batch_running](https://github.com/eastgenomics/dias_batch_running)) is a Python module that runs the Dias pipeline for germline sequence data analysis on DNAnexus. The egg4_dias_TWE_config specifies the executables and their input files to be used in the Dias pipeline for analysing TWE data.

New versions of apps and app inputs for use in the Dias pipeline can be updated in the config without needing to update the pipeline itself.

## Parts of the config
* dias_reports
  * specifies the workflow ID, stage IDs (matching those in the workflow), and dynamic files for dias_reports.

## Versions of workflows and dynamic files in the config
Workflows:
* Dias reports: **dias_reports_v2.1.0**
  * DNAnexus workflow ID: `workflow-GXzkfYj4QPQp9z4Jz4BF09y6`

Dynamic files:
| File | File name | DNAnexus file ID |
| --------- | --------- | ---------------- |
| genepanels | **230602_genepanels.tsv** | `file-GVx0vkQ433Gvq63k1Kj4Y562` |
| genes2transcripts | **230421_g2t.tsv** | `file-GV4P970433Gj6812zGVBZvB4` |
| exons_nirvana | **GCF_000001405.25_GRCh37.p13_genomic.exon_5bp_v2.0.0.tsv** | `file-GF611Z8433Gk7gZ47gypK7ZZ` |
| exons_file for eggd_athena | **GCF_000001405.25_GRCh37.p13_genomic.symbols.exon_5bp_v2.0.0.tsv** | `file-GF611Z8433Gf99pBPbJkV7bq` |
| twe_vep_config for SNV reports | **twe_vep_config_v1.1.6.json** | `file-GYX8q204j4fpP18Qx7YGkJvX` |
77 changes: 0 additions & 77 deletions egg4_config_v2.0.4.py

This file was deleted.

89 changes: 89 additions & 0 deletions egg4_config_v2.1.0.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Scalar settings for the TWE (Twist Whole Exome) assay config.
# Every DNAnexus file / workflow reference below is written as
# "<ref_project_id>:<object ID>".
assay_name = "TWE"
assay_version = "v2.1.0"

# Project that every file and workflow ID in this config is prefixed with
ref_project_id = "project-Fkb6Gkj433GVVvj73J7x8KbV"


### Dynamic files:

## generate_bed inputs
# genepanels, 230602 release
genepanels_file = ref_project_id + ":file-GVx0vkQ433Gvq63k1Kj4Y562"
# genes2transcripts (g2t), 230421 release
genes2transcripts = ref_project_id + ":file-GV4P970433Gj6812zGVBZvB4"
# GCF_000001405.25_GRCh37.p13_genomic.exon_5bp_v2.0.0.tsv
exons_nirvana = ref_project_id + ":file-GF611Z8433Gk7gZ47gypK7ZZ"

## generate_bed_for_VEP input
# NOTE(review): presumably the flank size in bases - confirm against the app
vep_bed_flank = 495

## eggd_Athena input
# GCF_000001405.25_GRCh37.p13_genomic.symbols.exon_5bp_v2.0.0.tsv
exons_file = ref_project_id + ":file-GF611Z8433Gf99pBPbJkV7bq"

## eggd_VEP input
# VEP config file for SNV reports, v1.1.6
vep_config = ref_project_id + ":file-GYX8q204j4fpP18Qx7YGkJvX"


### Apps and workflows:

# dias_reports workflow, v2.1.0
rpt_workflow_id = ref_project_id + ":workflow-GXzkfYj4QPQp9z4Jz4BF09y6"

# Stage IDs of the reports workflow; used as prefixes for stage input names
generate_bed_vep_stage_id = "stage-rpt_generate_bed_vep"
vep_stage_id = "stage-rpt_vep"
generate_workbook_stage_id = "stage-rpt_generate_workbook"
generate_bed_athena_stage_id = "stage-rpt_generate_bed_athena"
athena_stage_id = "stage-rpt_athena"

# Inputs for the reports workflow, keyed by "<stage ID>.<input name>".
# File inputs come as a pair: "<input> ID" holds the DNAnexus file reference
# and the bare "<input>" key is left as an empty string.
# NOTE(review): presumably the bare key is filled in by dias_batch_running -
# confirm against the consumer.
# Long values are split over adjacent string literals, which Python joins
# into a single string; mind the trailing space on each fragment.
rpt_dynamic_files = {
    # --- generate_bed for VEP ---
    generate_bed_vep_stage_id + ".exons_nirvana ID": exons_nirvana,
    generate_bed_vep_stage_id + ".exons_nirvana": "",
    generate_bed_vep_stage_id + ".nirvana_genes2transcripts ID": genes2transcripts,
    generate_bed_vep_stage_id + ".nirvana_genes2transcripts": "",
    generate_bed_vep_stage_id + ".gene_panels ID": genepanels_file,
    generate_bed_vep_stage_id + ".gene_panels": "",

    # --- eggd_vep ---
    vep_stage_id + ".config_file ID": vep_config,
    vep_stage_id + ".config_file": "",

    # --- generate_variant_workbook ---
    generate_workbook_stage_id + ".exclude_columns": (
        "BaseQRankSum ClippingRankSum DB ExcessHet FS MLEAC MLEAF MQ "
        "MQRankSum QD ReadPosRankSum SOR PL QUAL ID FILTER "
        "CSQ_ClinVar_CLNSIGCONF CSQ_Allele CSQ_HGNC_ID DP AC AF AN "
        "CSQ_SpliceAI_pred_DP_AL CSQ_SpliceAI_pred_DP_AG "
        "CSQ_SpliceAI_pred_DP_DG CSQ_SpliceAI_pred_DP_DL"
    ),
    generate_workbook_stage_id + ".acmg": "true",
    generate_workbook_stage_id + ".rename_columns": (
        "CSQ_Feature=Transcript DP_FMT=DP"
    ),
    generate_workbook_stage_id + ".add_comment_column": "true",
    generate_workbook_stage_id + ".keep_tmp": "true",
    generate_workbook_stage_id + ".summary": "dias",
    # bcftools exclusion expression; human_filter below is its plain-English
    # description and the two should be kept in step
    generate_workbook_stage_id + ".filter": (
        "bcftools filter -e '("
        "CSQ_Consequence==\"synonymous_variant\" | "
        "CSQ_Consequence==\"intron_variant\" | "
        "CSQ_Consequence==\"upstream_gene_variant\" | "
        "CSQ_Consequence==\"downstream_gene_variant\" | "
        "CSQ_Consequence==\"intergenic_variant\" | "
        "CSQ_Consequence==\"5_prime_UTR_variant\" | "
        "CSQ_Consequence==\"3_prime_UTR_variant\" | "
        "CSQ_gnomADe_AF>0.01 | "
        "CSQ_gnomADg_AF>0.01 | "
        "CSQ_TWE_AF>0.05) & "
        "CSQ_HGMD_CLASS!~ \"DM\" & "
        "CSQ_ClinVar_CLNSIG!~ \"pathogenic\\/i\" & "
        "CSQ_ClinVar_CLNSIGCONF!~ \"pathogenic\\/i\"'"
    ),
    generate_workbook_stage_id + ".human_filter": (
        "excluded gnomAD exomes / genomes > 1%, TWE > 5%, "
        "synonymous / intronic / intergenic / upstream / downstream / UTRs "
        "EXCEPT pathogenic status in ClinVar OR DM in HGMD Class"
    ),
    generate_workbook_stage_id + ".reorder_columns": (
        "CHROM POS REF ALT GT GQ DP_FMT AD "
        "CSQ_SYMBOL CSQ_EXON CSQ_INTRON CSQ_HGVSc CSQ_HGVSp "
        "CSQ_Consequence CSQ_IMPACT CSQ_VARIANT_CLASS "
        "CSQ_gnomADe_AF CSQ_gnomADe_Hom CSQ_gnomADe_AC CSQ_gnomADe_AN "
        "CSQ_gnomADg_AF CSQ_gnomADg_AC CSQ_gnomADg_AN "
        "CSQ_TWE_AF CSQ_TWE_AC_Hom CSQ_TWE_AC_Het CSQ_TWE_AN "
        "CSQ_HGMD CSQ_HGMD_CLASS CSQ_HGMD_RANKSCORE CSQ_HGMD_PHEN "
        "CSQ_Existing_variation "
        "CSQ_ClinVar CSQ_ClinVar_CLNDN CSQ_ClinVar_CLNSIG "
        "CSQ_Mastermind_MMID3 CSQ_CADD_PHRED CSQ_REVEL "
        "CSQ_SpliceAI_pred_DS_AG CSQ_SpliceAI_pred_DS_AL "
        "CSQ_SpliceAI_pred_DS_DG CSQ_SpliceAI_pred_DS_DL "
        "CSQ_HGVS_OFFSET CSQ_STRAND CSQ_Feature"
    ),

    # --- generate_bed for athena ---
    generate_bed_athena_stage_id + ".exons_nirvana ID": exons_nirvana,
    generate_bed_athena_stage_id + ".exons_nirvana": "",
    generate_bed_athena_stage_id + ".nirvana_genes2transcripts ID": genes2transcripts,
    generate_bed_athena_stage_id + ".nirvana_genes2transcripts": "",
    generate_bed_athena_stage_id + ".gene_panels ID": genepanels_file,
    generate_bed_athena_stage_id + ".gene_panels": "",

    # --- eggd_athena ---
    athena_stage_id + ".exons_file ID": exons_file,
    athena_stage_id + ".exons_file": "",
    athena_stage_id + ".limit": "260",
    athena_stage_id + ".summary": "true",
}

# Per-sample inputs of the reports workflow. Each "<stage ID>.<input name>"
# key maps to the app named under "app", a "subdir" value (empty here) and a
# search "pattern" for the files. The "{}" inside each pattern is left
# unformatted in this file.
# NOTE(review): presumably the consumer substitutes the sample name for "{}"
# and searches that app's output - confirm.
rpt_stage_input_dict = {
    # VCF input for eggd_vep
    vep_stage_id + ".vcf": dict(
        app="sentieon-dnaseq",
        subdir="",
        pattern="-E '{}(.*)[^g].vcf.gz$'",
    ),
    # mosdepth input for eggd_athena, which requires both the per-base
    # files and the reference files
    athena_stage_id + ".mosdepth_files": dict(
        app="mosdepth",
        subdir="",
        pattern="-E '{}(.*)(per-base.bed.gz$|reference)'",
    ),
}

0 comments on commit eb08b78

Please sign in to comment.