Skip to content

Commit

Permalink
Adding a tool to validate VCFs
Browse files Browse the repository at this point in the history
  • Loading branch information
nh13 committed Dec 8, 2020
1 parent 57988d6 commit 0015b79
Show file tree
Hide file tree
Showing 10 changed files with 776 additions and 24 deletions.
193 changes: 193 additions & 0 deletions src/main/scala/com/fulcrumgenomics/vcf/ValidateVcf.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
/*
* The MIT License
*
* Copyright (c) 2020 Fulcrum Genomics
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*/

package com.fulcrumgenomics.vcf

import com.fulcrumgenomics.FgBioDef.{PathToVcf, SafelyClosable}
import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.{LazyLogging, LogLevel, Logger, SimpleCounter}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.util.{Io, ProgressLogger}
import com.fulcrumgenomics.vcf.api.VcfHeader._
import com.fulcrumgenomics.vcf.api._
import com.fulcrumgenomics.vcf.validation.GenotypeValidator.{PhaseSetGenotypeValidator, VariantFormatExtraFieldValidator, VariantFormatValidator}
import com.fulcrumgenomics.vcf.validation.VariantValidator.{VariantInfoExtraFieldValidator, VariantInfoValidator}
import com.fulcrumgenomics.vcf.validation.{ValidationResult, VariantValidator, VcfHeaderValidator, _}

import scala.collection.mutable.ArrayBuffer

@clp(group=ClpGroups.VcfOrBcf, description=
"""
|Validates a VCF.
|
|# Header
|
|Errors if:
|- Reserved INFO/FORMAT lines do not have the proper type and count
|- Duplicate contig names are found
|
|Warns if:
|- No contig lines are present
|
|# Variants
|
|When checking variants, the header is updated to use the VCF-specification reserved INFO/FORMAT field definitions.
|
|Errors if:
|- INFO/FORMAT field has the wrong type compared to the VCF header
|- INFO/FORMAT field has the wrong count compared to the VCF header
|- INFO/FORMAT field is not defined in the VCF header
|
|# Future work
|
|Validate:
|- genomic coordinate sorted
|- ID values
|- FILTER values
|- values for specific fields (ex. CIGAR)
|- values across variants (ex. phase set, spanning alleles)
|- across fields (ex. allele depth vs allele frequency, allele depth vs forward/reverse allele dpeth)
|- additional contig lines (ALT/PEDIGREE)
|- structural variant and complex re-arrangements
|- gVCFs explicitly
"""
)
class ValidateVcf
(@arg(flag='i', doc="Input VCF file.") val input: PathToVcf,
@arg(flag='l', doc="The level of issues to emit.") val level: LogLevel = LogLevel.Info,
@arg(flag='e', doc="Emit only the first N messages.") val emitFirstN: Option[Int] = None,
@arg(flag='n', doc="Process only the first N records.") val examineFirstN: Option[Int] = None,
@arg(flag='k', doc="Allow a mismatch between the actual INFO/FORMAT type in the VCF record and the type defined in the VCF header.")
val allowTypeMismatch: Boolean = false,
@arg(flag='x', doc="Allow INFO/FORMAT fields in the VCF record that are not present in the VCF header.")
val allowExtraFields: Boolean = false,
@arg(flag='s', doc="Summarize counts of messages.") val summarizeMessages: Boolean = false
) extends FgBioTool with LazyLogging {
Io.assertReadable(input)

private val levelCounter: SimpleCounter[LogLevel] = new SimpleCounter[LogLevel]()
private val messageCounter: SimpleCounter[String] = new SimpleCounter[String]()

override def execute(): Unit = {
Logger.level = this.level

val progress = new ProgressLogger(logger=logger, noun="variants", verb="examined", unit=100000)

// Validate the VCF header
val reader = VcfSource(path=input, headerTransformer=identity, allowKindMismatch=allowTypeMismatch, allowExtraFields=allowExtraFields)
val headerValidators = VcfHeaderValidator.Validators
val headerEntryValidators = VcfHeaderEntryValidator.ReservedVcfHeaderEntryValidators
headerValidators.foreach { validator => validator.validate(header=reader.header).process() }
reader.header.entries.foreach { entry =>
headerEntryValidators.foreach { validator =>
validator.validate(entry=entry).process()
}
}

// Validate the variants
val variantValidators = this.getVariantValidators(header=reader.header)
val iter = examineFirstN match {
case None => reader.view
case Some(n) => reader.take(n)
}
iter.foreach { variant: Variant =>
progress.record(variant=variant)
variantValidators.foreach(_.validate(variant=variant).process())
}
progress.logLast()
reader.safelyClose()

// Summarize and log errors
finish()
}

private implicit class ProcessValidationResults(results: Seq[ValidationResult]) {
def process(): Unit = results.foreach { result =>
val newResult = new ProcessValidationResult(result=result)
newResult.process()
}
}

private implicit class ProcessValidationResult(result: ValidationResult) {
def process(): Unit = {
levelCounter.count(result.level)
if (summarizeMessages) messageCounter.count(result.message)
if (emitFirstN.forall(levelCounter.total <= _)) result.emit(logger)
if (emitFirstN.contains(levelCounter.total)) {
logger.info(s"Reached ${levelCounter.total} messages, suppressing...")
}
}
}

private def finish(): Unit = {
if (summarizeMessages) {
messageCounter.foreach { case (message, count) =>
val extra = if (count > 1) "s" else ""
logger.info(f"Found $count%,d message$extra: $message")
}
}

levelCounter.foreach { case (level, count) =>
val extra = if (count > 1) "s" else ""
logger.info(f"Found $count%,d at $level$extra")
}
}

private def getVariantValidators(header: VcfHeader): Seq[VariantValidator] = {
val buffer = ArrayBuffer[VariantValidator]()
val reservedInfoIds = ReservedVcfInfoHeaders.map(_.id).toSet
val reservedFormatIds = ReservedVcfFormatHeaders.map(_.id).toSet
val infoKeys = header.info.keySet
val formatKeys = header.format.keySet

// Reserved INFO validators
buffer ++= VariantValidator.VariantInfoValidators

// Validators from INFOs defined in the header
buffer ++= header.infos
.filterNot(info => reservedInfoIds.contains(info.id))
.map(info => VariantInfoValidator(info=info))

// INFO fields in the record not in the header
buffer += VariantInfoExtraFieldValidator(infoKeys)

// Reserved FORMAT validators
buffer ++= GenotypeValidator.VariantFormatValidators

// Validators from FORMATs defined in the header
buffer ++= header.formats
.filterNot(format => reservedFormatIds.contains(format.id))
.map(format => VariantFormatValidator(format=format))

// Custom FORMAT validators
buffer += PhaseSetGenotypeValidator

// FORMAT fields in the record not in the header
buffer += VariantFormatExtraFieldValidator(formatKeys)

buffer.toIndexedSeq
}
}

6 changes: 4 additions & 2 deletions src/main/scala/com/fulcrumgenomics/vcf/api/Variant.scala
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,14 @@
package com.fulcrumgenomics.vcf.api

import scala.collection.immutable.ListMap
import scala.reflect.ClassTag

object Variant {
/** Value used in VCF for values that are missing. */
val Missing: String = "."

/** Value used in VCF for values that are missing. */
val MissingChar: Char = '.'

/** Value used in arrays of ints for missing values. */
val MissingInt: Int = Int.MinValue

Expand All @@ -41,7 +43,7 @@ object Variant {
* will always return false.
*/
val MissingFloat: Float = {
import java.lang.{Long => JLong, Float => JFloat}
import java.lang.{Float => JFloat, Long => JLong}
val l = JLong.parseLong("7F800001", 16)
JFloat.intBitsToFloat(l.intValue())
}
Expand Down
63 changes: 49 additions & 14 deletions src/main/scala/com/fulcrumgenomics/vcf/api/VcfConversions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ import htsjdk.variant.vcf._

import scala.collection.JavaConverters.mapAsJavaMapConverter
import scala.collection.immutable.ListMap
import scala.collection.mutable

/**
* Object that provides methods for converting from fgbio's scala VCF classes to HTSJDK's
Expand Down Expand Up @@ -195,16 +194,41 @@ private[api] object VcfConversions {
case VcfFieldType.String => VCFHeaderLineType.String
}

private def addToBuffer(add: ((String, Any)) => Unit,
entry: Option[VcfHeaderInfoOrFormatEntry],
key: String,
value: Any,
fieldType: String,
allowKindMismatch: Boolean = false,
allowExtraFields: Boolean = false): Unit = entry match {
case None if allowExtraFields => add(key -> value)
case None => throw new IllegalStateException(s"$fieldType field $key not described in header.")
case Some(hd) =>
if (!allowKindMismatch) toTypedValue(value, hd.kind, hd.count).foreach(v => add(key -> v))
else {
try {
toTypedValue(value, hd.kind, hd.count).foreach(v => add(key -> v))
} catch {
case _: Exception => add(key -> value)
}
}
}

/**
* Converts a [[VariantContext]] and all nested classes into a [[Variant]] and set of [[Genotype]]s.
*
* @param in the [[VariantContext]] to be converted
* @param header the scala [[VcfHeader]] which contains the definitions of all the INFO and FORMAT
* fields as well as the ordered list of sample names.
* @param allowKindMismatch allow a mismatch between the actual kind and the kind defined in the VCF header
* @param allowExtraFields allow fields not defined in the VCF header
* @return a [[Variant]] instance that is a copy of the [[VariantContext]] and does not rely on it
* post-return
*/
def toScalaVariant(in: VariantContext, header: VcfHeader): Variant = try {
def toScalaVariant(in: VariantContext,
header: VcfHeader,
allowKindMismatch: Boolean = false,
allowExtraFields: Boolean = false): Variant = try {
// Build up the allele set
val scalaAlleles = in.getAlleles.iterator.map(a => Allele(a.getDisplayString)).toIndexedSeq
val alleleMap = in.getAlleles.iterator().zip(scalaAlleles).toMap
Expand Down Expand Up @@ -233,13 +257,19 @@ private[api] object VcfConversions {
if (g.hasGQ) builder += ("GQ" -> g.getGQ)
if (g.hasPL) builder += ("PL" -> g.getPL.toIndexedSeq)

g.getExtendedAttributes.keySet().foreach { key =>
val value = g.getExtendedAttribute(key)

header.format.get(key) match {
case Some(hd) => toTypedValue(value, hd.kind, hd.count).foreach(v => builder += (key -> v))
case None => throw new IllegalStateException(s"Format field $key not described in header.")
}
g.getExtendedAttributes.entrySet().foreach { entry =>
val key = entry.getKey
val value = entry.getValue

addToBuffer(
add = builder.addOne,
entry = header.format.get(key),
key = key,
value = value,
fieldType = "Format",
allowKindMismatch = allowKindMismatch,
allowExtraFields = allowExtraFields
)
}

builder.result()
Expand All @@ -255,10 +285,15 @@ private[api] object VcfConversions {
inInfo.entrySet().foreach { entry =>
val key = entry.getKey
val value = entry.getValue
header.info.get(key) match {
case Some(hd) => toTypedValue(value, hd.kind, hd.count).foreach(v => builder += (key -> v))
case None => throw new IllegalStateException(s"INFO field $key not described in header.")
}
addToBuffer(
add = builder.addOne,
entry = header.info.get(key),
key = key,
value = value,
fieldType = "Info",
allowKindMismatch = allowKindMismatch,
allowExtraFields = allowExtraFields
)
}

builder.result()
Expand All @@ -282,7 +317,7 @@ private[api] object VcfConversions {
)
}
catch {
case ex: Throwable => throw new RuntimeException(s"Failed to convert variant: ${in}", ex)
case ex: Throwable => throw new RuntimeException(s"Failed to convert variant: $in", ex)
}

/**
Expand Down
Loading

0 comments on commit 0015b79

Please sign in to comment.