broadinstitute · bpblanken · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025 · Oct 10, 2025
@@ -323,6 +323,20 @@ def variant_annotations_vcf_path(
     )
 
 
+def variant_annotations_parquet_path(
+    reference_genome: ReferenceGenome,
+    dataset_type: DatasetType,
+) -> str:
+    return os.path.join(
+        pipeline_prefix(
+            Env.PIPELINE_DATA_DIR,
+            reference_genome,
+            dataset_type,
+        ),
+        'annotations.parquet',
+    )
+
+
 def new_entries_parquet_path(
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,

@@ -0,0 +1,65 @@
+import hail as hl
+import luigi
+import luigi.util
+
+from v03_pipeline.lib.paths import (
+    variant_annotations_parquet_path,
+    variant_annotations_table_path,
+)
+from v03_pipeline.lib.tasks.base.base_loading_run_params import (
+    BaseLoadingRunParams,
+)
+from v03_pipeline.lib.tasks.base.base_write_parquet import BaseWriteParquetTask
+from v03_pipeline.lib.tasks.exports.fields import get_variants_export_fields
+from v03_pipeline.lib.tasks.exports.misc import (
+    camelcase_array_structexpression_fields,
+    drop_unexported_fields,
+    unmap_formatting_annotation_enums,
+    unmap_reference_dataset_annotation_enums,
+)
+from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
+    UpdateVariantAnnotationsTableWithNewSamplesTask,
+)
+
+
+@luigi.util.inherits(BaseLoadingRunParams)
+class WriteVariantAnnotationsParquetTask(BaseWriteParquetTask):
+    def output(self) -> luigi.Target:
+        return GCSorLocalTarget(
+            variant_annotations_parquet_path(
+                self.reference_genome,
+                self.dataset_type,
+            ),
+        )
+
+    def requires(self) -> luigi.Task:
+        return self.clone(UpdateVariantAnnotationsTableWithNewSamplesTask)
+
+    def create_table(self) -> None:
+        ht = hl.read_table(
+            variant_annotations_table_path(
+                self.reference_genome,
+                self.dataset_type,
+            ),
+        )
+        ht = drop_unexported_fields(ht)
+        ht = unmap_formatting_annotation_enums(
+            ht,
+            self.reference_genome,
+            self.dataset_type,
+        )
+        ht = unmap_reference_dataset_annotation_enums(
+            ht,
+            self.reference_genome,
+            self.dataset_type,
+        )
+        ht = camelcase_array_structexpression_fields(
+            ht,
+            self.reference_genome,
+            self.dataset_type,
+        )
+        ht = ht.key_by()
+        return ht.select(
+            **get_variants_export_fields(ht, self.reference_genome, self.dataset_type),
+        )
@@ -0,0 +1,228 @@
+import hail as hl
+import luigi.worker
+import pandas as pd
+
+from v03_pipeline.lib.misc.io import remap_pedigree_hash
+from v03_pipeline.lib.model import (
+    DatasetType,
+    ReferenceGenome,
+    SampleType,
+)
+from v03_pipeline.lib.paths import (
+    project_pedigree_path,
+    variant_annotations_parquet_path,
+    variant_annotations_table_path,
+)
+from v03_pipeline.lib.tasks.exports.write_annotations_parquet import (
+    WriteVariantAnnotationsParquetTask,
+)
+from v03_pipeline.lib.test.misc import (
+    convert_ndarray_to_list,
+    copy_project_pedigree_to_mocked_dir,
+)
+from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
+
+# Hail table fixture that setUp reads as the starting annotations table.
+TEST_SNV_INDEL_ANNOTATIONS = (
+    'v03_pipeline/var/test/exports/GRCh38/SNV_INDEL/annotations.ht'
+)
+# Pedigree fixture that setUp hands to copy_project_pedigree_to_mocked_dir.
+TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'
+
+# run_id passed to the task under test.
+TEST_RUN_ID = 'manual__2024-04-03'
+
+
+class WriteAnnotationsParquetTest(MockedDatarootTestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        copy_project_pedigree_to_mocked_dir(
+            TEST_PEDIGREE_3_REMAP,
+            ReferenceGenome.GRCh38,
+            DatasetType.SNV_INDEL,
+            SampleType.WGS,
+            'R0113_test_project',
+        )
+        ht = hl.read_table(
+            TEST_SNV_INDEL_ANNOTATIONS,
+        )
+        ht = ht.annotate_globals(
+            updates=ht.updates.add(
+                hl.Struct(
+                    callset='fake_callset',
+                    project_guid='R0113_test_project',
+                    remap_pedigree_hash=remap_pedigree_hash(
+                        project_pedigree_path(
+                            ReferenceGenome.GRCh38,
+                            DatasetType.SNV_INDEL,
+                            SampleType.WGS,
+                            'R0113_test_project',
+                        ),
+                    ),
+                ),
+            ),
+        )
+        ht.write(
+            variant_annotations_table_path(
+                ReferenceGenome.GRCh38,
+                DatasetType.SNV_INDEL,
+            ),
+        )
+
+    def test_write_annotations_parquet_test(
+        self,
+    ) -> None:
+        worker = luigi.worker.Worker()
+        task = WriteVariantAnnotationsParquetTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
+            callset_path='fake_callset',
+            project_guids=[
+                'R0113_test_project',
+            ],
+            skip_validation=True,
+            run_id=TEST_RUN_ID,
+        )
+        worker.add(task)
+        worker.run()
+        self.assertTrue(task.output().exists())
+        self.assertTrue(task.complete())
+        df = pd.read_parquet(
+            variant_annotations_parquet_path(
+                ReferenceGenome.GRCh38,
+                DatasetType.SNV_INDEL,
+            ),
+        )
+        export_json = convert_ndarray_to_list(df.head(1).to_dict('records'))
+        export_json[0]['sortedTranscriptConsequences'] = [
+            export_json[0]['sortedTranscriptConsequences'][0],
+        ]
+        self.assertEqual(
+            export_json,
+            [
+                {
+                    'key': 0,
+                    'xpos': 1000876499,
+                    'chrom': '1',
+                    'pos': 876499,
+                    'ref': 'A',
+                    'alt': 'G',
+                    'variantId': '1-876499-A-G',
+                    'rsid': None,
+                    'CAID': 'CA502654',
+                    'liftedOverChrom': '1',
+                    'liftedOverPos': 874501,
+                    'hgmd': {'accession': 'abcdefg', 'classification': 'DFP'},
+                    'screenRegionType': None,
+                    'predictions': {
+                        'cadd': 23.5,
+                        'eigen': 2.628000020980835,
+                        'fathmm': 0.7174800038337708,
+                        'gnomad_noncoding': None,
+                        'mpc': 0.01291007362306118,
+                        'mut_pred': None,
+                        'mut_tester': 'D',
+                        'polyphen': 0.164000004529953,
+                        'primate_ai': 0.5918066501617432,
+                        'revel': 0.3109999895095825,
+                        'sift': 0.0010000000474974513,
+                        'splice_ai': 0.0,
+                        'splice_ai_consequence': 'No consequence',
+                        'vest': 0.39500001072883606,
+                    },
+                    'populations': {
+                        'exac': {
+                            'ac': 20,
+                            'af': 0.00019039999460801482,
+                            'an': 47974,
+                            'filter_af': 0.0007150234305299819,
+                            'hemi': None,
+                            'het': 20,
+                            'hom': 0,
+                        },
+                        'gnomad_exomes': {
+                            'ac': 964,
+                            'af': 0.0006690866430290043,
+                            'an': 1440770,
+                            'filter_af': 0.0008023773552849889,
+                            'hemi': 0,
+                            'hom': 0,
+                        },
+                        'gnomad_genomes': {
+                            'ac': 42,
+                            'af': 0.0002759889466688037,
+                            'an': 152180,
+                            'filter_af': 0.10000000149011612,
+                            'hemi': 0,
+                            'hom': 0,
+                        },
+                        'topmed': {
+                            'ac': 41,
+                            'af': 0.00032651599030941725,
+                            'an': 125568,
+                            'het': 41,
+                            'hom': 0,
+                        },
+                    },
+                    'sortedMotifFeatureConsequences': [
+                        {
+                            'motifFeatureId': 'ENSM00493959715',
+                            'consequenceTerms': ['TF_binding_site_variant'],
+                        },
+                    ],
+                    'sortedRegulatoryFeatureConsequences': [
+                        {
+                            'regulatoryFeatureId': 'ENSR00000344437',
+                            'biotype': 'CTCF_binding_site',
+                            'consequenceTerms': ['regulatory_region_variant'],
+                        },
+                    ],
+                    'sortedTranscriptConsequences': [
+                        {
+                            'aminoAcids': 'S/L',
+                            'canonical': 1.0,
+                            'codons': 'tCg/tTg',
+                            'geneId': 'ENSG00000187634',
+                            'hgvsc': 'ENST00000616016.5:c.1049C>T',
+                            'hgvsp': 'ENSP00000478421.2:p.Ser350Leu',
+                            'transcriptId': 'ENST00000616016',
+                            'maneSelect': 'NM_001385641.1',
+                            'manePlusClinical': None,
+                            'exon': {'index': 6, 'total': 14},
+                            'intron': None,
+                            'refseqTranscriptId': 'NM_001385641.1',
+                            'alphamissense': {'pathogenicity': None},
+                            'loftee': {'isLofNagnag': None, 'lofFilters': None},
+                            'spliceregion': {
+                                'extended_intronic_splice_region_variant': False,
+                            },
+                            'utrannotator': {
+                                'existingInframeOorfs': None,
+                                'existingOutofframeOorfs': None,
+                                'existingUorfs': None,
+                                'fiveutrAnnotation': {
+                                    'type': 'OutOfFrame_oORF',
+                                    'KozakContext': 'CGCATGC',
+                                    'KozakStrength': 'Weak',
+                                    'DistanceToCDS': 41,
+                                    'CapDistanceToStart': None,
+                                    'DistanceToStop': None,
+                                    'Evidence': None,
+                                    'AltStop': None,
+                                    'AltStopDistanceToCDS': None,
+                                    'FrameWithCDS': None,
+                                    'StartDistanceToCDS': None,
+                                    'newSTOPDistanceToCDS': None,
+                                    'alt_type': None,
+                                    'alt_type_length': None,
+                                    'ref_StartDistanceToCDS': None,
+                                    'ref_type': None,
+                                    'ref_type_length': None,
+                                },
+                                'fiveutrConsequence': None,
+                            },
+                            'biotype': 'protein_coding',
+                            'consequenceTerms': ['missense_variant'],
+                        },
+                    ],
+                },
+            ],
+        )