VS-1476. Removing all usage of VQSR from VDS creation. #9015

Open · wants to merge 6 commits into ah_var_store
4 changes: 0 additions & 4 deletions scripts/variantstore/wdl/GvsCreateVDS.wdl
@@ -20,7 +20,6 @@ workflow GvsCreateVDS {
Int? cluster_max_age_minutes
Boolean leave_cluster_running_at_end = false
Float? master_memory_fraction
- Boolean use_VQSR = false

String? git_branch_or_tag
String? hail_version
@@ -112,7 +111,6 @@
prefix = cluster_prefix,
vds_path = vds_destination_path,
avro_path = avro_path,
- use_VQSR = use_VQSR,
hail_version = effective_hail_version,
hail_wheel = hail_wheel,
hail_temp_path = hail_temp_path,
@@ -158,7 +156,6 @@ task CreateVds {
String prefix
String vds_path
String avro_path
- Boolean use_VQSR
Boolean leave_cluster_running_at_end
File hail_gvs_import_script
File gvs_import_script
@@ -233,7 +230,6 @@ task CreateVds {
"temp-path": "${hail_temp_path}",
"avro-path": "~{avro_path}"
~{', "intermediate-resume-point": ' + intermediate_resume_point}
- ~{true=', "use-vqsr": ""' false='' use_VQSR}
}
FIN

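With the `use_VQSR` plumbing gone, the options blob this task heredocs into `hail_gvs_import.py` carries only the path and resume-point keys visible in the hunk above. A minimal sketch of the resulting JSON, built in Python purely for illustration (the bucket paths are placeholders, not values from this PR):

```python
import json

# Placeholder values; the WDL task interpolates real ones from its inputs.
options = {
    "temp-path": "gs://example-bucket/hail-temp",
    "avro-path": "gs://example-bucket/avro",
    "intermediate-resume-point": 0,  # optional, only emitted when resuming
}

# No "use-vqsr" key any more: the importer unconditionally expects VETS data.
print(json.dumps(options, indent=2))
```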
29 changes: 3 additions & 26 deletions scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
@@ -12,7 +12,6 @@ workflow GvsExtractAvroFilesForHail {
String dataset_name
String filter_set_name
String call_set_identifier
- Boolean use_VETS = true
Int scatter_width = 10
String? basic_docker
String? cloud_sdk_docker
@@ -43,14 +42,6 @@
cloud_sdk_docker = effective_cloud_sdk_docker,
}

- call Utils.IsVETS {
- input:
- project_id = project_id,
- fq_filter_set_info_table = "~{project_id}.~{dataset_name}.filter_set_info",
- filter_set_name = filter_set_name,
- cloud_sdk_docker = effective_cloud_sdk_docker,
- }

call OutputPath {
input:
go = ValidateFilterSetName.done,
@@ -74,7 +65,6 @@
filter_set_name = filter_set_name,
avro_sibling = OutputPath.out,
call_set_identifier = call_set_identifier,
- is_vets = IsVETS.is_vets,
variants_docker = effective_variants_docker,
}

@@ -192,7 +182,7 @@ task ExtractFromSampleInfoTable {

task ExtractFromFilterTables {
meta {
- description: "Extracts from the tables: filter_set_sites, filter_set_info, and filter_set_tranches (if using VQSR)"
+ description: "Extracts from the tables: filter_set_sites and filter_set_info"
# Not dealing with caching for now as that would introduce a lot of complexity.
volatile: true
}
@@ -203,12 +193,9 @@ task ExtractFromFilterTables {
String filter_set_name
String avro_sibling
String call_set_identifier
- Boolean is_vets = true
String variants_docker
}

- String vqs_score_field = if (is_vets == true) then 'calibration_sensitivity' else 'vqslod'

parameter_meta {
avro_sibling: "Cloud path to a file that will be the sibling to the 'avro' 'directory' under which output Avro files will be written."
}
@@ -222,8 +209,8 @@ task ExtractFromFilterTables {

python3 /app/run_avro_query.py --sql "
EXPORT DATA OPTIONS(
- uri='${avro_prefix}/vqsr_filtering_data/vqsr_filtering_data_*.avro', format='AVRO', compression='SNAPPY') AS
- SELECT location, type as model, ref, alt, ~{vqs_score_field}, yng_status
+ uri='${avro_prefix}/vets_filtering_data/vets_filtering_data_*.avro', format='AVRO', compression='SNAPPY') AS
+ SELECT location, type as model, ref, alt, calibration_sensitivity, yng_status
FROM \`~{project_id}.~{dataset_name}.filter_set_info\`
WHERE filter_set_name = '~{filter_set_name}'
ORDER BY location
@@ -237,16 +224,6 @@ task ExtractFromFilterTables {
WHERE filter_set_name = '~{filter_set_name}'
ORDER BY location
" --call_set_identifier ~{call_set_identifier} --dataset_name ~{dataset_name} --table_name filter_set_sites --project_id=~{project_id}

- if [ ~{is_vets} = false ]; then
- python3 /app/run_avro_query.py --sql "
- EXPORT DATA OPTIONS(
- uri='${avro_prefix}/vqsr_tranche_data/vqsr_tranche_data_*.avro', format='AVRO', compression='SNAPPY') AS
- SELECT model, truth_sensitivity, min_vqslod, filter_name
- FROM \`~{project_id}.~{dataset_name}.filter_set_tranches\`
- WHERE filter_set_name = '~{filter_set_name}'
- " --call_set_identifier ~{call_set_identifier} --dataset_name ~{dataset_name} --table_name filter_set_tranches --project_id=~{project_id}
- fi
>>>

output {
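The filtering-data export now always reads `calibration_sensitivity` and writes under `vets_filtering_data/`, and the conditional tranche export is gone entirely. A standalone sketch of the same BigQuery statement using the `google-cloud-bigquery` client instead of the repo's `run_avro_query.py` wrapper; the project, dataset, filter set name, and bucket are placeholders:

```python
from google.cloud import bigquery

# Placeholders only; substitute real identifiers.
project_id, dataset_name = "my-project", "my_dataset"
filter_set_name = "my_filter_set"
avro_prefix = "gs://example-bucket/avro"

sql = f"""
EXPORT DATA OPTIONS(
  uri='{avro_prefix}/vets_filtering_data/vets_filtering_data_*.avro',
  format='AVRO', compression='SNAPPY') AS
SELECT location, type AS model, ref, alt, calibration_sensitivity, yng_status
FROM `{project_id}.{dataset_name}.filter_set_info`
WHERE filter_set_name = '{filter_set_name}'
ORDER BY location
"""

bigquery.Client(project=project_id).query(sql).result()  # block until the export job finishes
```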
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -72,7 +72,7 @@ task GetToolVersions {
# GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but
# there are a handful of tasks that require the larger GNU libc-based `slim`.
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
- String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-10-17-alpine-6dd528be8d8d"
+ String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-10-22-alpine-e7443149b8db"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_10_10-gatkbase-1cd1f9652cb9"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
1 change: 0 additions & 1 deletion scripts/variantstore/wdl/GvsValidateVDS.wdl
@@ -18,7 +18,6 @@ workflow GvsValidateVDS {
Int? cluster_max_age_minutes
Boolean leave_cluster_running_at_end = false
Float? master_memory_fraction
- Boolean use_classic_VQSR = false

String? git_branch_or_tag
String? hail_version
@@ -9,7 +9,7 @@ def write_chr_vcf(mt, chrom, full_path, metadata, tabix=True):
hl.export_vcf(mt, full_path, tabix=tabix, metadata=metadata)


- def make_dense_mt(vds, max_alt_alleles=None, is_keep_as_vqsr_fields=False):
+ def make_dense_mt(vds, max_alt_alleles=None, is_keep_as_vets_fields=False):
"""
Given a VDS, follow a standard method to densify into a MatrixTable.
This process includes:
@@ -23,7 +23,7 @@ def make_dense_mt(vds, max_alt_alleles=None, is_keep_as_vqsr_fields=False):
:param vds: input VDS as a gs URL
:param max_alt_alleles (None): Drop any variant sites that have the number of alt alleles exceeding this value.
Specify None to not prune any variant sites by this criteria.
- :param is_keep_as_vqsr_fields (False): If True, keep the AS_VQSR fields, but move the fields to the info field.
+ :param is_keep_as_vets_fields (False): If True, keep the AS_VETS fields, but move the fields to the info field.
:return: a dense MatrixTable
"""
vd_gt = vds.variant_data
@@ -61,9 +61,9 @@
AN=d_callset.variant_qc.AN,
homozygote_count=d_callset.variant_qc.homozygote_count))

- if is_keep_as_vqsr_fields and ('as_vqsr' in d_callset.row):
- d_callset = d_callset.annotate_rows(info = d_callset.info.annotate(AS_VQSLOD = d_callset.as_vqsr.values().vqslod,
- AS_YNG = d_callset.as_vqsr.values().yng_status))
+ if is_keep_as_vets_fields and ('as_vets' in d_callset.row):
+ d_callset = d_callset.annotate_rows(info = d_callset.info.annotate(AS_CALIBRATION_SENSITIVITY = d_callset.as_vets.values().calibration_sensitivity,
+ AS_YNG = d_callset.as_vets.values().yng_status))

# Drop duplicate nested fields that are already in INFO field rendered by call_stats()
d_callset = d_callset.annotate_rows(variant_qc = d_callset.variant_qc.drop("AC", "AF", "AN", "homozygote_count"))
@@ -72,7 +72,7 @@
# 'LGT', 'LA' have already been removed
# Please note that this cell is specific to AoU VDS and fields specified here may not exist in test data.
# This will not fail if a field is missing.
- fields_to_drop = ['as_vqsr', 'LAD',
+ fields_to_drop = ['as_vets', 'LAD',
'tranche_data', 'truth_sensitivity_snp_threshold',
'truth_sensitivity_indel_threshold','snp_vqslod_threshold','indel_vqslod_threshold']
d_callset = d_callset.drop(*(f for f in fields_to_drop if f in d_callset.entry or f in d_callset.row or f in d_callset.col))
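The flag rename tracks the rename of the row field itself: `as_vets` holds per-model structs keyed by model name, and field access on `dict.values()` broadcasts across the resulting array. A minimal sketch of the field move, assuming a dense MatrixTable shaped like the one this script produces:

```python
import hail as hl

def move_as_vets_into_info(mt: hl.MatrixTable) -> hl.MatrixTable:
    """Sketch: copy VETS annotations into INFO, then drop the nested field.

    Assumes `mt` has row fields `info` (a struct) and
    `as_vets: dict<str, struct{model, calibration_sensitivity, yng_status}>`.
    """
    if 'as_vets' in mt.row:
        # values() yields an array of structs; attribute access broadcasts,
        # so each INFO field becomes an array with one element per model.
        mt = mt.annotate_rows(info=mt.info.annotate(
            AS_CALIBRATION_SENSITIVITY=mt.as_vets.values().calibration_sensitivity,
            AS_YNG=mt.as_vets.values().yng_status))
        mt = mt.drop('as_vets')
    return mt
```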
6 changes: 3 additions & 3 deletions scripts/variantstore/wdl/extract/gvs_vds_tie_out.py
@@ -44,7 +44,7 @@
AF: array<float64>,
AN: int32,
AS_QUALapprox: str,
- AS_VQSLOD: array<str>,
+ AS_CALIBRATION_SENSITIVITY: array<str>,
AS_YNG: array<str>,
QUALapprox: int32
}
@@ -53,9 +53,9 @@
locus: locus<GRCh38>,
alleles: array<str>,
filters: set<str>,
- as_vqsr: dict<str, struct {
+ as_vets: dict<str, struct {
model: str,
- vqslod: float64,
+ calibration_sensitivity: float64,
yng_status: str
}>
}
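The expected types above can be spot-checked against a freshly imported VDS. A hypothetical check (the VDS path is a placeholder), assuming Hail's `hl.vds.read_vds`:

```python
import hail as hl

vds = hl.vds.read_vds('gs://example-bucket/my.vds')  # placeholder path
vd = vds.variant_data

assert 'as_vets' in vd.row, 'expected VETS filtering data on variant rows'
per_model = vd.as_vets.dtype.value_type  # struct, one entry per filtering model
assert 'calibration_sensitivity' in per_model
assert 'vqslod' not in per_model  # VQSR scoring is gone
```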
16 changes: 5 additions & 11 deletions scripts/variantstore/wdl/extract/hail_gvs_import.py
@@ -14,7 +14,7 @@
gcs_re = re.compile("^gs://(?P<bucket_name>[^/]+)/(?P<object_prefix>.*)$")


- def create_vds(argsfn, vds_path, references_path, temp_path, use_vqsr, intermediate_resume_point):
+ def create_vds(argsfn, vds_path, references_path, temp_path, intermediate_resume_point):
import hail as hl
import import_gvs
from hail.utils.java import Env
@@ -37,8 +37,7 @@ def create_vds(argsfn, vds_path, references_path, temp_path, use_vqsr, intermedi
refs=argsfn('refs'),
sample_mapping=argsfn('sample_mapping'),
site_filtering_data=argsfn('site_filtering_data'),
- vqsr_filtering_data=argsfn('vqsr_filtering_data'),
- vqsr_tranche_data=argsfn('vqsr_tranche_data'),
+ vets_filtering_data=argsfn('vets_filtering_data'),
reference_genome=rg38,
final_path=vds_path,
tmp_dir=f'{temp_path}/hail_tmp_create_vds',
@@ -47,7 +46,6 @@
# partitions_per_sample=0.35, # check with Hail about how to tune this for your large callset
# intermediate_resume_point=0, # if your first run fails, and you want to use the intermediate files that already exist, check in with Hail to find out what stage to resume on
# skip_final_merge=false, # if you want to create your VDS in two steps (because of mem issues) this can be skipped until the final run
- use_vqsr=use_vqsr,
intermediate_resume_point=intermediate_resume_point
)
finally:
Expand All @@ -68,8 +66,7 @@ def gcs_generate_avro_args(bucket, blob_prefix, key):
* refs (list of lists, one outer list per GVS superpartition of 4000 samples max)
* sample_mapping (list)
* site_filtering_data (list)
- * vqsr_filtering_data (list)
- * vqsr_tranche_data (list)
+ * vets_filtering_data (list)
"""

keyed_prefix = f"{blob_prefix}/{key}/"
@@ -142,8 +139,6 @@ def regular_handler():
required=True)
parser.add_argument('--references-path', type=str, help='Path to references, only required for local files',
required=False)
parser.add_argument("--use-vqsr", action="store_true",
help="If set, expect that the input GVS Avro files were generated using VQSR")
parser.add_argument('--intermediate-resume-point', type=int, required=False, default=0,
help='Intermediate VDS index at which to resume')

@@ -152,7 +147,6 @@
# Remove trailing slashes if present.
avro_path, temp_path, vds_path = [p if not p.endswith('/') else p[:-1] for p in
[args.avro_path, args.temp_path, args.vds_path]]
- use_vqsr = args.use_vqsr
is_gcs = [gcs_re.match(p) for p in [avro_path, temp_path, vds_path]]
is_not_gcs = [not g for g in is_gcs]

@@ -163,7 +157,7 @@
def arguments(key):
return gcs_generate_avro_args(avro_bucket, avro_object_prefix, key)

- create_vds(arguments, vds_path, 'gs://hail-common/references', temp_path, use_vqsr,
+ create_vds(arguments, vds_path, 'gs://hail-common/references', temp_path,
args.intermediate_resume_point)

elif all(is_not_gcs):
@@ -176,7 +170,7 @@ def arguments(key):
def arguments(key):
return local_generate_avro_args(avro_path, key)

- create_vds(arguments, vds_path, references_path, temp_path, use_vqsr,
+ create_vds(arguments, vds_path, references_path, temp_path,
args.intermediate_resume_point)
else:
raise ValueError("Arguments appear to be some unsavory mix of GCS and local paths, all or nothing please.")
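With `--use-vqsr` removed, a driver only passes path arguments plus an optional resume point. A hypothetical invocation sketch; the paths are placeholders, and `--avro-path`/`--vds-path`/`--temp-path` are inferred from the `args.avro_path` etc. usages above:

```python
import subprocess

subprocess.run(
    [
        'python3', 'hail_gvs_import.py',
        '--avro-path', 'gs://example-bucket/avro',    # placeholder
        '--vds-path', 'gs://example-bucket/my.vds',   # placeholder
        '--temp-path', 'gs://example-bucket/tmp',     # placeholder
        '--intermediate-resume-point', '0',
        # note: no --use-vqsr; the importer always consumes vets_filtering_data
    ],
    check=True,
)
```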