Skip to content

Commit

Permalink
improvement(clinvcf): ✨ add varianttype and vraiantlength
Browse files Browse the repository at this point in the history
  • Loading branch information
MelanieBroutin committed Nov 20, 2020
1 parent 8462010 commit 4aa60d3
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 3 deletions.
20 changes: 18 additions & 2 deletions src/clinvcf.nim
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ type
molecular_consequences: seq[MolecularConsequence]
submissions: seq[Submission]
pathologies: TableRef[string, seq[string]]
type_v: string
length: int

const ignoredPathoTag = @["not specified", "see cases", "not provided", "variant of unknown significance"]

Expand Down Expand Up @@ -561,6 +563,7 @@ proc loadVariants*(clinvar_xml_file: string, genome_assembly: string): tuple[var
# TODO: We should have a more pretty way to do it as the MeasureSet in this case is bellow the GenotypeSet:
# <GenotypeSet Type="CompoundHeterozygote" ID="424779" Acc="VCV000424779" Version="1">
# <MeasureSet Type="Variant" ID="928" Acc="VCV000000928" Version="2" NumberOfChromosomes="1">
# <Measure Type="single nucleotide variant" ID="46341">
if reference_clinvar_assertion_nodes[0].select("genotypeset").len() > 0:
continue

Expand Down Expand Up @@ -594,13 +597,20 @@ proc loadVariants*(clinvar_xml_file: string, genome_assembly: string): tuple[var
pos_string = sequence_loc.attr("positionVCF")
ref_allele = sequence_loc.attr("referenceAlleleVCF")
alt_allele = sequence_loc.attr("alternateAlleleVCF")

type_v = measure_node.attr("Type")
start = sequence_loc.attr("start")
stop = sequence_loc.attr("stop")

var
pos : int = -1
length = sequence_loc.attr("variantLength")

if pos_string != "":
pos = pos_string.parseInt()

if length == "":
length = $(parseInt(stop) - parseInt(start))

# Parse dbSNP rsid
# FIXME: Use this kind of loop to replace q calls and only explore first line childs in loops !!!
# <XRef Type="rs" ID="846664" DB="dbSNP"/>
Expand All @@ -620,7 +630,9 @@ proc loadVariants*(clinvar_xml_file: string, genome_assembly: string): tuple[var
rsid: cast[int32](rsid),
ref_allele: ref_allele,
alt_allele: alt_allele,
pathologies: newTable[string, seq[string]]()
pathologies: newTable[string, seq[string]](),
type_v: type_v,
length: cast[int](length)
)
result.variants[variant_id] = variant

Expand Down Expand Up @@ -810,6 +822,8 @@ proc printVCF*(variants: seq[ClinVariant], genome_assembly: string, filedate: st
# ##INFO=<ID=ORIGIN,Number=.,Type=String,Description="Allele origin. One or more of the following values may be added: 0 - unknown; 1 - germline; 2 - somatic; 4 - inherited; 8 - paternal; 16 - maternal; 3
# 2 - de-novo; 64 - biparental; 128 - uniparental; 256 - not-tested; 512 - tested-inconclusive; 1073741824 - other">
echo "##INFO=<ID=RS,Number=.,Type=String,Description=\"dbSNP ID (i.e. rs number)\">"
echo "##INFO=<ID=VARIANTTYPE,Number=.,Type=String,Description=\"Type of variant\">"
echo "##INFO=<ID=VARIANTLENGTH,Number=.,Type=Integer,Description=\"Lentgh of variant\">"
# ##INFO=<ID=SSR,Number=1,Type=Integer,Description="Variant Suspect Reason Codes. One or more of the following values may be added: 0 - unspecified, 1 - Paralog, 2 - byEST, 4 - oldAlign, 8 - Para_EST, 16
# - 1kg_failed, 1024 - other">
echo "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"
Expand All @@ -836,6 +850,8 @@ proc printVCF*(variants: seq[ClinVariant], genome_assembly: string, filedate: st

info_fields.add("CLNSIG=" & clinsig.formatVCFString())
info_fields.add("CLNREVSTAT=" & revstat.formatVCFString())
info_fields.add("VARIANTTYPE=" & v.type_v.formatVCFString())
info_fields.add("VARIANTLENGTH=" & $v.length)

if old_clinsig != "":
info_fields.add("OLD_CLNSIG=" & old_clinsig.formatVCFString())
Expand Down
39 changes: 38 additions & 1 deletion tests/functional-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,22 @@ assert_in_stdout "GENEINFO=BRCA2:675"
assert_in_stdout "CLNREVSTAT=criteria_provided,_conflicting_interpretations"
assert_in_stdout "MC=SO:0001583|missense_variant"
assert_in_stdout "RS=80358507"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Check integration of NCBI clinsig conversion
run ncbi_clnsig_conversion $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/109.xml
assert_exit_code 0
assert_in_stdout "CLNSIG=Likely_pathogenic,_risk_factor"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Multiple submission from same submitter
run mutli_subs_from_same_submitter $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/307134.xml
assert_exit_code 0
assert_in_stdout "CLNREVSTAT=criteria_provided,_single_submitter"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Multiple submission from same submitter
run skip_het_compound $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/928.xml
Expand All @@ -38,64 +44,87 @@ assert_in_stdout "CLNSIG=Likely_pathogenic"
run same_submitter_conflict $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/1166.xml
assert_exit_code 0
assert_in_stdout "CLNREVSTAT=criteria_provided,_conflicting_interpretations"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Multiple 3-4 stars subs, take them all !!! (see case 7108)
run run_multiple_3_4_star_subs $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/7108.xml
assert_exit_code 0
assert_in_stdout "CLNSIG=Pathogenic,_drug_response"
assert_in_stdout "CLNREVSTAT=reviewed_by_expert_panel"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Sort non-ACMG clnsig lexicographically
run sort_non_acmg_cnlsig_tags $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/5333.xml
assert_exit_code 0
assert_in_stdout "CLNSIG=Affects,_risk_factor"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Sort non-ACMG clnsig lexicographically
run expert_panel $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/582.xml
assert_exit_code 0
assert_in_stdout "CLNSIG=Pathogenic;"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Correction of conflicting interpretation
run conflict_deciphering $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/9.xml
assert_exit_code 0
assert_in_stdout "CLNSIG=Pathogenic"
assert_in_stdout "OLD_CLNSIG=Conflicting_interpretations_of_pathogenicity"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Handle multiple gene and select the prefered one from HGVS
run multi_gene_selection $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/TREX1.gff $grch37_version tests/files/225499.xml
assert_exit_code 0
assert_in_stdout "GENEINFO=TREX1:11277"
assert_in_stdout "VARIANTTYPE=Duplication"
assert_in_stdout "VARIANTLENGTH=1"

# Handle mutliple gene and select the prefered based on submissions (HGVS has no gene)
run multi_gene_selection $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/CFTR.gff $grch37_version tests/files/618897_2019-05.xml
assert_exit_code 0
assert_in_stdout "GENEINFO=CFTR:1080"
assert_in_stdout "VARIANTTYPE=Deletion"
assert_in_stdout "VARIANTLENGTH=1"

# Mitochondrial annotations
run mito_anno $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/MT.gff $grch37_version tests/files/9618.xml
assert_exit_code 0
assert_in_stdout "GENEINFO=TRNE:4556"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Coding first option for gene anno to force using protein coding annotation
run coding_first_control $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/ADORA2A.gff $grch37_version tests/files/225974.xml
assert_exit_code 0
assert_in_stdout "GENEINFO=ADORA2A:135"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"
run coding_first_option $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/ADORA2A.gff --coding-first $grch37_version tests/files/225974.xml
assert_exit_code 0
assert_in_stdout "GENEINFO=ADORA2A:135"

assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"
# Consider close exonic regions (20bp padding) as exonic in gene priorization module
# In this case variant is in FTCD-AS1 (protein-coding) exon but 9bp away from FTCD.
# We discriminate these two gene using the gene_id of FTCD that is smaller that FTCD-AS1
run close_exonic_region $exe --hgnc tests/files/hgnc_toy.tsv --gff tests/files/FTCD.gff $grch37_version tests/files/340430.xml
assert_exit_code 0
assert_in_stdout "GENEINFO=FTCD:10841"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# For antivariant (same as the reference)
# We use the "." for the alternate allele representation
run antivariant $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/242771.xml
assert_exit_code 0
assert_in_stdout "22 42523943 242771 A ."
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Haplotypes should not be exported in the VCF
run haplotype $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/16895.xml
Expand All @@ -106,17 +135,25 @@ run three_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_ver
assert_exit_code 0
assert_in_stdout "CLNSIG=Pathogenic/Likely_pathogenic"
assert_in_stdout "CLNRECSTAT=3"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

run two_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/140866.xml
assert_exit_code 0
assert_in_stdout "CLNSIG=Likely_pathogenic"
assert_in_stdout "CLNRECSTAT=2"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

run one_star_reclassification $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/182965.xml
assert_exit_code 0
assert_in_stdout "CLNRECSTAT=1"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

# Pathology parsing
run pathology_field_parsing $exe --hgnc tests/files/hgnc_toy.tsv $grch37_version tests/files/109.xml
assert_exit_code 0
assert_in_stdout "CLNDISEASE=pheochromocytoma_susceptibility_to|pheochromocytoma"
assert_in_stdout "VARIANTTYPE=single_nucleotide_variant"
assert_in_stdout "VARIANTLENGTH=1"

0 comments on commit 4aa60d3

Please sign in to comment.