Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions v03_pipeline/lib/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,20 @@ def variant_annotations_vcf_path(
)


def variant_annotations_parquet_path(
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
) -> str:
    """Return the path of the variant annotations parquet export.

    The path is ``annotations.parquet`` joined onto the pipeline prefix
    built from ``Env.PIPELINE_DATA_DIR``, the reference genome and the
    dataset type.
    """
    dataset_prefix = pipeline_prefix(
        Env.PIPELINE_DATA_DIR,
        reference_genome,
        dataset_type,
    )
    return os.path.join(dataset_prefix, 'annotations.parquet')


def new_entries_parquet_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
Expand Down
65 changes: 65 additions & 0 deletions v03_pipeline/lib/tasks/exports/write_annotations_parquet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import hail as hl
import luigi
import luigi.util

from v03_pipeline.lib.paths import (
variant_annotations_parquet_path,
variant_annotations_table_path,
)
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
BaseLoadingRunParams,
)
from v03_pipeline.lib.tasks.base.base_write_parquet import BaseWriteParquetTask
from v03_pipeline.lib.tasks.exports.fields import get_variants_export_fields
from v03_pipeline.lib.tasks.exports.misc import (
camelcase_array_structexpression_fields,
drop_unexported_fields,
unmap_formatting_annotation_enums,
unmap_reference_dataset_annotation_enums,
)
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
UpdateVariantAnnotationsTableWithNewSamplesTask,
)


@luigi.util.inherits(BaseLoadingRunParams)
class WriteVariantAnnotationsParquetTask(BaseWriteParquetTask):
    """Export the variant annotations Hail table in its parquet export shape.

    Task parameters are inherited from ``BaseLoadingRunParams``. This class
    only supplies the output location, the upstream dependency and the table
    to export.
    NOTE(review): the parquet write itself is presumably performed by
    ``BaseWriteParquetTask`` using ``create_table`` -- confirm in the base.
    """

    def output(self) -> luigi.Target:
        """Return the target at the annotations parquet path for this run."""
        return GCSorLocalTarget(
            variant_annotations_parquet_path(
                self.reference_genome,
                self.dataset_type,
            ),
        )

    def requires(self) -> luigi.Task:
        """Depend on the annotations table update, cloned with this task's params."""
        return self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask)

    def create_table(self) -> hl.Table:
        """Build the table that is exported.

        Reads the variant annotations table for this reference genome and
        dataset type, runs it through the export helpers from
        ``exports.misc`` in order, removes the key and selects the fields
        returned by ``get_variants_export_fields``.

        Returns:
            The un-keyed Hail table holding only the export fields.
        """
        ht = hl.read_table(
            variant_annotations_table_path(
                self.reference_genome,
                self.dataset_type,
            ),
        )
        ht = drop_unexported_fields(ht)
        ht = unmap_formatting_annotation_enums(
            ht,
            self.reference_genome,
            self.dataset_type,
        )
        ht = unmap_reference_dataset_annotation_enums(
            ht,
            self.reference_genome,
            self.dataset_type,
        )
        ht = camelcase_array_structexpression_fields(
            ht,
            self.reference_genome,
            self.dataset_type,
        )
        # key_by() with no arguments removes the key; presumably needed so
        # the select below is free to reshape former key fields -- confirm.
        ht = ht.key_by()
        return ht.select(
            **get_variants_export_fields(ht, self.reference_genome, self.dataset_type),
        )
228 changes: 228 additions & 0 deletions v03_pipeline/lib/tasks/exports/write_annotations_parquet_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
import hail as hl
import luigi.worker
import pandas as pd

from v03_pipeline.lib.misc.io import remap_pedigree_hash
from v03_pipeline.lib.model import (
DatasetType,
ReferenceGenome,
SampleType,
)
from v03_pipeline.lib.paths import (
project_pedigree_path,
variant_annotations_parquet_path,
variant_annotations_table_path,
)
from v03_pipeline.lib.tasks.exports.write_annotations_parquet import (
WriteVariantAnnotationsParquetTask,
)
from v03_pipeline.lib.test.misc import (
convert_ndarray_to_list,
copy_project_pedigree_to_mocked_dir,
)
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase

# Hail table fixture that setUp reads and re-writes to the mocked
# variant annotations table path.
TEST_SNV_INDEL_ANNOTATIONS = (
'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht'
)
# Pedigree TSV that setUp copies into the mocked directory for the test project.
TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'

# run_id parameter passed to the task under test.
TEST_RUN_ID = 'manual__2024-04-03'


class WriteAnnotationsParquetTest(MockedDatarootTestCase):
    """Checks the parquet export of the SNV_INDEL annotations table."""

    def setUp(self) -> None:
        """Stage the pedigree and an annotations table marked as up to date.

        The fixture table gets an extra ``updates`` global entry for the
        fake callset / test project before it is written to the mocked
        annotations table path.
        """
        super().setUp()
        copy_project_pedigree_to_mocked_dir(
            TEST_PEDIGREE_3_REMAP,
            ReferenceGenome.GRCh38,
            DatasetType.SNV_INDEL,
            SampleType.WGS,
            'R0113_test_project',
        )
        annotations_ht = hl.read_table(TEST_SNV_INDEL_ANNOTATIONS)
        pedigree_hash = remap_pedigree_hash(
            project_pedigree_path(
                ReferenceGenome.GRCh38,
                DatasetType.SNV_INDEL,
                SampleType.WGS,
                'R0113_test_project',
            ),
        )
        update_entry = hl.Struct(
            callset='fake_callset',
            project_guid='R0113_test_project',
            remap_pedigree_hash=pedigree_hash,
        )
        annotations_ht = annotations_ht.annotate_globals(
            updates=annotations_ht.updates.add(update_entry),
        )
        destination = variant_annotations_table_path(
            ReferenceGenome.GRCh38,
            DatasetType.SNV_INDEL,
        )
        annotations_ht.write(destination)

    def test_write_annotations_parquet_test(
        self,
    ) -> None:
        """Run the task and compare the first exported row to a fixed record."""
        # Expected first row; only the first transcript consequence is kept.
        expected_rows = [
            {
                'key': 0,
                'xpos': 1000876499,
                'chrom': '1',
                'pos': 876499,
                'ref': 'A',
                'alt': 'G',
                'variantId': '1-876499-A-G',
                'rsid': None,
                'CAID': 'CA502654',
                'liftedOverChrom': '1',
                'liftedOverPos': 874501,
                'hgmd': {'accession': 'abcdefg', 'classification': 'DFP'},
                'screenRegionType': None,
                'predictions': {
                    'cadd': 23.5,
                    'eigen': 2.628000020980835,
                    'fathmm': 0.7174800038337708,
                    'gnomad_noncoding': None,
                    'mpc': 0.01291007362306118,
                    'mut_pred': None,
                    'mut_tester': 'D',
                    'polyphen': 0.164000004529953,
                    'primate_ai': 0.5918066501617432,
                    'revel': 0.3109999895095825,
                    'sift': 0.0010000000474974513,
                    'splice_ai': 0.0,
                    'splice_ai_consequence': 'No consequence',
                    'vest': 0.39500001072883606,
                },
                'populations': {
                    'exac': {
                        'ac': 20,
                        'af': 0.00019039999460801482,
                        'an': 47974,
                        'filter_af': 0.0007150234305299819,
                        'hemi': None,
                        'het': 20,
                        'hom': 0,
                    },
                    'gnomad_exomes': {
                        'ac': 964,
                        'af': 0.0006690866430290043,
                        'an': 1440770,
                        'filter_af': 0.0008023773552849889,
                        'hemi': 0,
                        'hom': 0,
                    },
                    'gnomad_genomes': {
                        'ac': 42,
                        'af': 0.0002759889466688037,
                        'an': 152180,
                        'filter_af': 0.10000000149011612,
                        'hemi': 0,
                        'hom': 0,
                    },
                    'topmed': {
                        'ac': 41,
                        'af': 0.00032651599030941725,
                        'an': 125568,
                        'het': 41,
                        'hom': 0,
                    },
                },
                'sortedMotifFeatureConsequences': [
                    {
                        'motifFeatureId': 'ENSM00493959715',
                        'consequenceTerms': ['TF_binding_site_variant'],
                    },
                ],
                'sortedRegulatoryFeatureConsequences': [
                    {
                        'regulatoryFeatureId': 'ENSR00000344437',
                        'biotype': 'CTCF_binding_site',
                        'consequenceTerms': ['regulatory_region_variant'],
                    },
                ],
                'sortedTranscriptConsequences': [
                    {
                        'aminoAcids': 'S/L',
                        'canonical': 1.0,
                        'codons': 'tCg/tTg',
                        'geneId': 'ENSG00000187634',
                        'hgvsc': 'ENST00000616016.5:c.1049C>T',
                        'hgvsp': 'ENSP00000478421.2:p.Ser350Leu',
                        'transcriptId': 'ENST00000616016',
                        'maneSelect': 'NM_001385641.1',
                        'manePlusClinical': None,
                        'exon': {'index': 6, 'total': 14},
                        'intron': None,
                        'refseqTranscriptId': 'NM_001385641.1',
                        'alphamissense': {'pathogenicity': None},
                        'loftee': {'isLofNagnag': None, 'lofFilters': None},
                        'spliceregion': {
                            'extended_intronic_splice_region_variant': False,
                        },
                        'utrannotator': {
                            'existingInframeOorfs': None,
                            'existingOutofframeOorfs': None,
                            'existingUorfs': None,
                            'fiveutrAnnotation': {
                                'type': 'OutOfFrame_oORF',
                                'KozakContext': 'CGCATGC',
                                'KozakStrength': 'Weak',
                                'DistanceToCDS': 41,
                                'CapDistanceToStart': None,
                                'DistanceToStop': None,
                                'Evidence': None,
                                'AltStop': None,
                                'AltStopDistanceToCDS': None,
                                'FrameWithCDS': None,
                                'StartDistanceToCDS': None,
                                'newSTOPDistanceToCDS': None,
                                'alt_type': None,
                                'alt_type_length': None,
                                'ref_StartDistanceToCDS': None,
                                'ref_type': None,
                                'ref_type_length': None,
                            },
                            'fiveutrConsequence': None,
                        },
                        'biotype': 'protein_coding',
                        'consequenceTerms': ['missense_variant'],
                    },
                ],
            },
        ]

        scheduler_worker = luigi.worker.Worker()
        export_task = WriteVariantAnnotationsParquetTask(
            reference_genome=ReferenceGenome.GRCh38,
            dataset_type=DatasetType.SNV_INDEL,
            sample_type=SampleType.WGS,
            callset_path='fake_callset',
            project_guids=[
                'R0113_test_project',
            ],
            skip_validation=True,
            run_id=TEST_RUN_ID,
        )
        scheduler_worker.add(export_task)
        scheduler_worker.run()
        self.assertTrue(export_task.output().exists())
        self.assertTrue(export_task.complete())

        parquet_path = variant_annotations_parquet_path(
            ReferenceGenome.GRCh38,
            DatasetType.SNV_INDEL,
        )
        exported = pd.read_parquet(parquet_path)
        export_json = convert_ndarray_to_list(exported.head(1).to_dict('records'))
        # Trim the transcript list to its first element before comparing.
        first_row = export_json[0]
        leading_transcript = first_row['sortedTranscriptConsequences'][0]
        first_row['sortedTranscriptConsequences'] = [leading_transcript]
        self.assertEqual(export_json, expected_rows)