diff --git a/src/main/scala/com/fulcrumgenomics/vcf/ValidateVcf.scala b/src/main/scala/com/fulcrumgenomics/vcf/ValidateVcf.scala new file mode 100644 index 000000000..8d2c9e3a1 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/vcf/ValidateVcf.scala @@ -0,0 +1,193 @@ +/* + * The MIT License + * + * Copyright (c) 2020 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.vcf + +import com.fulcrumgenomics.FgBioDef.{PathToVcf, SafelyClosable} +import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} +import com.fulcrumgenomics.commons.util.{LazyLogging, LogLevel, Logger, SimpleCounter} +import com.fulcrumgenomics.sopt.{arg, clp} +import com.fulcrumgenomics.util.{Io, ProgressLogger} +import com.fulcrumgenomics.vcf.api.VcfHeader._ +import com.fulcrumgenomics.vcf.api._ +import com.fulcrumgenomics.vcf.validation.GenotypeValidator.{PhaseSetGenotypeValidator, VariantFormatExtraFieldValidator, VariantFormatValidator} +import com.fulcrumgenomics.vcf.validation.VariantValidator.{VariantInfoExtraFieldValidator, VariantInfoValidator} +import com.fulcrumgenomics.vcf.validation.{ValidationResult, VariantValidator, VcfHeaderValidator, _} + +import scala.collection.mutable.ArrayBuffer + +@clp(group=ClpGroups.VcfOrBcf, description= + """ + |Validates a VCF. + | + |# Header + | + |Errors if: + |- Reserved INFO/FORMAT lines do not have the proper type and count + |- Duplicate contig names are found + | + |Warns if: + |- No contig lines are present + | + |# Variants + | + |When checking variants, the header is updated to use the VCF-specification reserved INFO/FORMAT field definitions. + | + |Errors if: + |- INFO/FORMAT field has the wrong type compared to the VCF header + |- INFO/FORMAT field has the wrong count compared to the VCF header + |- INFO/FORMAT field is not defined in the VCF header + | + |# Future work + | + |Validate: + |- genomic coordinate sorted + |- ID values + |- FILTER values + |- values for specific fields (ex. CIGAR) + |- values across variants (ex. phase set, spanning alleles) + |- across fields (ex. allele depth vs allele frequency, allele depth vs forward/reverse allele dpeth) + |- additional contig lines (ALT/PEDIGREE) + |- structural variant and complex re-arrangements + |- gVCFs explicitly + """ +) +class ValidateVcf +(@arg(flag='i', doc="Input VCF file.") val input: PathToVcf, + @arg(flag='l', doc="The level of issues to emit.") val level: LogLevel = LogLevel.Info, + @arg(flag='e', doc="Emit only the first N messages.") val emitFirstN: Option[Int] = None, + @arg(flag='n', doc="Process only the first N records.") val examineFirstN: Option[Int] = None, + @arg(flag='k', doc="Allow a mismatch between the actual INFO/FORMAT type in the VCF record and the type defined in the VCF header.") + val allowTypeMismatch: Boolean = false, + @arg(flag='x', doc="Allow INFO/FORMAT fields in the VCF record that are not present in the VCF header.") + val allowExtraFields: Boolean = false, + @arg(flag='s', doc="Summarize counts of messages.") val summarizeMessages: Boolean = false +) extends FgBioTool with LazyLogging { + Io.assertReadable(input) + + private val levelCounter: SimpleCounter[LogLevel] = new SimpleCounter[LogLevel]() + private val messageCounter: SimpleCounter[String] = new SimpleCounter[String]() + + override def execute(): Unit = { + Logger.level = this.level + + val progress = new ProgressLogger(logger=logger, noun="variants", verb="examined", unit=100000) + + // Validate the VCF header + val reader = VcfSource(path=input, headerTransformer=identity, allowKindMismatch=allowTypeMismatch, allowExtraFields=allowExtraFields) + val headerValidators = VcfHeaderValidator.Validators + val headerEntryValidators = VcfHeaderEntryValidator.ReservedVcfHeaderEntryValidators + headerValidators.foreach { validator => validator.validate(header=reader.header).process() } + reader.header.entries.foreach { entry => + headerEntryValidators.foreach { validator => + validator.validate(entry=entry).process() + } + } + + // Validate the variants + val variantValidators = this.getVariantValidators(header=reader.header) + val iter = examineFirstN match { + case None => reader.view + case Some(n) => reader.take(n) + } + iter.foreach { variant: Variant => + progress.record(variant=variant) + variantValidators.foreach(_.validate(variant=variant).process()) + } + progress.logLast() + reader.safelyClose() + + // Summarize and log errors + finish() + } + + private implicit class ProcessValidationResults(results: Seq[ValidationResult]) { + def process(): Unit = results.foreach { result => + val newResult = new ProcessValidationResult(result=result) + newResult.process() + } + } + + private implicit class ProcessValidationResult(result: ValidationResult) { + def process(): Unit = { + levelCounter.count(result.level) + if (summarizeMessages) messageCounter.count(result.message) + if (emitFirstN.forall(levelCounter.total <= _)) result.emit(logger) + if (emitFirstN.contains(levelCounter.total)) { + logger.info(s"Reached ${levelCounter.total} messages, suppressing...") + } + } + } + + private def finish(): Unit = { + if (summarizeMessages) { + messageCounter.foreach { case (message, count) => + val extra = if (count > 1) "s" else "" + logger.info(f"Found $count%,d message$extra: $message") + } + } + + levelCounter.foreach { case (level, count) => + val extra = if (count > 1) "s" else "" + logger.info(f"Found $count%,d at $level$extra") + } + } + + private def getVariantValidators(header: VcfHeader): Seq[VariantValidator] = { + val buffer = ArrayBuffer[VariantValidator]() + val reservedInfoIds = ReservedVcfInfoHeaders.map(_.id).toSet + val reservedFormatIds = ReservedVcfFormatHeaders.map(_.id).toSet + val infoKeys = header.info.keySet + val formatKeys = header.format.keySet + + // Reserved INFO validators + buffer ++= VariantValidator.VariantInfoValidators + + // Validators from INFOs defined in the header + buffer ++= header.infos + .filterNot(info => reservedInfoIds.contains(info.id)) + .map(info => VariantInfoValidator(info=info)) + + // INFO fields in the record not in the header + buffer += VariantInfoExtraFieldValidator(infoKeys) + + // Reserved FORMAT validators + buffer ++= GenotypeValidator.VariantFormatValidators + + // Validators from FORMATs defined in the header + buffer ++= header.formats + .filterNot(format => reservedFormatIds.contains(format.id)) + .map(format => VariantFormatValidator(format=format)) + + // Custom FORMAT validators + buffer += PhaseSetGenotypeValidator + + // FORMAT fields in the record not in the header + buffer += VariantFormatExtraFieldValidator(formatKeys) + + buffer.toIndexedSeq + } +} + diff --git a/src/main/scala/com/fulcrumgenomics/vcf/api/Variant.scala b/src/main/scala/com/fulcrumgenomics/vcf/api/Variant.scala index 451d78e1e..ddd189c34 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/api/Variant.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/api/Variant.scala @@ -25,12 +25,14 @@ package com.fulcrumgenomics.vcf.api import scala.collection.immutable.ListMap -import scala.reflect.ClassTag object Variant { /** Value used in VCF for values that are missing. */ val Missing: String = "." + /** Value used in VCF for values that are missing. */ + val MissingChar: Char = '.' + /** Value used in arrays of ints for missing values. */ val MissingInt: Int = Int.MinValue @@ -41,7 +43,7 @@ object Variant { * will always return false. */ val MissingFloat: Float = { - import java.lang.{Long => JLong, Float => JFloat} + import java.lang.{Float => JFloat, Long => JLong} val l = JLong.parseLong("7F800001", 16) JFloat.intBitsToFloat(l.intValue()) } diff --git a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfConversions.scala b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfConversions.scala index 94208cfbd..4897ca102 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfConversions.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfConversions.scala @@ -36,7 +36,6 @@ import htsjdk.variant.vcf._ import scala.collection.JavaConverters.mapAsJavaMapConverter import scala.collection.immutable.ListMap -import scala.collection.mutable /** * Object that provides methods for converting from fgbio's scala VCF classes to HTSJDK's @@ -195,16 +194,41 @@ private[api] object VcfConversions { case VcfFieldType.String => VCFHeaderLineType.String } + private def addToBuffer(add: ((String, Any)) => Unit, + entry: Option[VcfHeaderInfoOrFormatEntry], + key: String, + value: Any, + fieldType: String, + allowKindMismatch: Boolean = false, + allowExtraFields: Boolean = false): Unit = entry match { + case None if allowExtraFields => add(key -> value) + case None => throw new IllegalStateException(s"$fieldType field $key not described in header.") + case Some(hd) => + if (!allowKindMismatch) toTypedValue(value, hd.kind, hd.count).foreach(v => add(key -> v)) + else { + try { + toTypedValue(value, hd.kind, hd.count).foreach(v => add(key -> v)) + } catch { + case _: Exception => add(key -> value) + } + } + } + /** * Converts a [[VariantContext]] and all nested classes into a [[Variant]] and set of [[Genotype]]s. * * @param in the [[VariantContext]] to be converted * @param header the scala [[VcfHeader]] which contains the definitions of all the INFO and FORMAT * fields as well as the ordered list of sample names. + * @param allowKindMismatch allow a mismatch between the actual kind and the kind defined in the VCF header + * @param allowExtraFields allow fields not defined in the VCF header * @return a [[Variant]] instance that is a copy of the [[VariantContext]] and does not rely on it * post-return */ - def toScalaVariant(in: VariantContext, header: VcfHeader): Variant = try { + def toScalaVariant(in: VariantContext, + header: VcfHeader, + allowKindMismatch: Boolean = false, + allowExtraFields: Boolean = false): Variant = try { // Build up the allele set val scalaAlleles = in.getAlleles.iterator.map(a => Allele(a.getDisplayString)).toIndexedSeq val alleleMap = in.getAlleles.iterator().zip(scalaAlleles).toMap @@ -233,13 +257,19 @@ private[api] object VcfConversions { if (g.hasGQ) builder += ("GQ" -> g.getGQ) if (g.hasPL) builder += ("PL" -> g.getPL.toIndexedSeq) - g.getExtendedAttributes.keySet().foreach { key => - val value = g.getExtendedAttribute(key) - - header.format.get(key) match { - case Some(hd) => toTypedValue(value, hd.kind, hd.count).foreach(v => builder += (key -> v)) - case None => throw new IllegalStateException(s"Format field $key not described in header.") - } + g.getExtendedAttributes.entrySet().foreach { entry => + val key = entry.getKey + val value = entry.getValue + + addToBuffer( + add = builder.addOne, + entry = header.format.get(key), + key = key, + value = value, + fieldType = "Format", + allowKindMismatch = allowKindMismatch, + allowExtraFields = allowExtraFields + ) } builder.result() @@ -255,10 +285,15 @@ private[api] object VcfConversions { inInfo.entrySet().foreach { entry => val key = entry.getKey val value = entry.getValue - header.info.get(key) match { - case Some(hd) => toTypedValue(value, hd.kind, hd.count).foreach(v => builder += (key -> v)) - case None => throw new IllegalStateException(s"INFO field $key not described in header.") - } + addToBuffer( + add = builder.addOne, + entry = header.info.get(key), + key = key, + value = value, + fieldType = "Info", + allowKindMismatch = allowKindMismatch, + allowExtraFields = allowExtraFields + ) } builder.result() @@ -282,7 +317,7 @@ private[api] object VcfConversions { ) } catch { - case ex: Throwable => throw new RuntimeException(s"Failed to convert variant: ${in}", ex) + case ex: Throwable => throw new RuntimeException(s"Failed to convert variant: $in", ex) } /** diff --git a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfHeader.scala b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfHeader.scala index ecb01f089..b53e609b3 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfHeader.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfHeader.scala @@ -71,6 +71,13 @@ object VcfFieldType extends FgBioEnum[VcfFieldType] { /** Trait representing an entry/line in a VCF header. */ sealed trait VcfHeaderEntry {} +sealed trait VcfHeaderInfoOrFormatEntry extends VcfHeaderEntry { + def id: String + def count: VcfCount + def kind: VcfFieldType + def description: String +} + // TODO add subclasses for ALT and PEDIGREE lines /** A contig line/entry in the VCF header. @@ -103,7 +110,7 @@ case class VcfInfoHeader(id: String, description: String, source: Option[String] = None, version: Option[String] = None - ) extends VcfHeaderEntry {} + ) extends VcfHeaderInfoOrFormatEntry {} /** @@ -117,7 +124,7 @@ case class VcfInfoHeader(id: String, case class VcfFormatHeader(id: String, count: VcfCount, kind: VcfFieldType, - description: String) extends VcfHeaderEntry {} + description: String) extends VcfHeaderInfoOrFormatEntry {} /** @@ -176,3 +183,54 @@ case class VcfHeader(contigs: IndexedSeq[VcfContigHeader], /** How many lines total are in the header. */ def size: Int = entries.size } + +object VcfHeader { + + import com.fulcrumgenomics.vcf.api.VcfCount._ + import com.fulcrumgenomics.vcf.api.{VcfFieldType => Kind} + + private implicit def countToFixedVcfCount(count: Int): VcfCount.Fixed = VcfCount.Fixed(count) + + val ReservedVcfInfoHeaders: Seq[VcfInfoHeader] = IndexedSeq( + VcfInfoHeader(id="AA", count=1, kind=Kind.String, description=""), + VcfInfoHeader(id="AC", count=OnePerAltAllele, kind=Kind.Integer, description=""), + VcfInfoHeader(id="AD", count=OnePerAllele, kind=Kind.Integer, description=""), + VcfInfoHeader(id="ADF", count=OnePerAllele, kind=Kind.Integer, description=""), + VcfInfoHeader(id="ADR", count=OnePerAllele, kind=Kind.Integer, description=""), + VcfInfoHeader(id="AF", count=OnePerAltAllele, kind=Kind.Float, description=""), + VcfInfoHeader(id="AN", count=1, kind=Kind.Integer, description=""), + VcfInfoHeader(id="BQ", count=1, kind=Kind.Integer, description=""), + VcfInfoHeader(id="CIGAR", count=OnePerAltAllele, kind=Kind.String, description=""), + VcfInfoHeader(id="DB", count=0, kind=Kind.Flag, description=""), + VcfInfoHeader(id="DP", count=1, kind=Kind.Integer, description=""), + VcfInfoHeader(id="END", count=1, kind=Kind.Integer, description=""), + VcfInfoHeader(id="H2", count=0, kind=Kind.Flag, description=""), + VcfInfoHeader(id="H3", count=0, kind=Kind.Flag, description=""), + VcfInfoHeader(id="MQ", count=1, kind=Kind.Float, description=""), + VcfInfoHeader(id="MQ0", count=1, kind=Kind.Integer, description=""), + VcfInfoHeader(id="NS", count=1, kind=Kind.Integer, description=""), + VcfInfoHeader(id="SB", count=4, kind=Kind.Integer, description=""), + VcfInfoHeader(id="SOMATIC", count=0, kind=Kind.Flag, description=""), + VcfInfoHeader(id="VALIDATED", count=0, kind=Kind.Flag, description=""), + VcfInfoHeader(id="1000G", count=0, kind=Kind.Flag, description=""), + ) + + val ReservedVcfFormatHeaders: Seq[VcfFormatHeader] = IndexedSeq( + VcfFormatHeader(id="AD", count=OnePerAllele, kind=Kind.Integer, description=""), + VcfFormatHeader(id="ADF", count=OnePerAllele, kind=Kind.Integer, description=""), + VcfFormatHeader(id="ADR", count=OnePerAllele, kind=Kind.Integer, description=""), + VcfFormatHeader(id="DP", count=1, kind=Kind.Integer, description=""), + VcfFormatHeader(id="EC", count=OnePerAltAllele, kind=Kind.Integer, description=""), + VcfFormatHeader(id="FT", count=1, kind=Kind.String, description=""), + VcfFormatHeader(id="GL", count=OnePerGenotype, kind=Kind.Float, description=""), + VcfFormatHeader(id="GP", count=OnePerGenotype, kind=Kind.Float, description=""), + VcfFormatHeader(id="GQ", count=1, kind=Kind.Integer, description=""), + VcfFormatHeader(id="GT", count=1, kind=Kind.String, description=""), + VcfFormatHeader(id="HQ", count=2, kind=Kind.Integer, description=""), + VcfFormatHeader(id="MQ", count=1, kind=Kind.Integer, description=""), + VcfFormatHeader(id="PL", count=OnePerGenotype, kind=Kind.Integer, description=""), + VcfFormatHeader(id="PP", count=OnePerGenotype, kind=Kind.Integer, description=""), + VcfFormatHeader(id="PQ", count=1, kind=Kind.Integer, description=""), + VcfFormatHeader(id="PS", count=1, kind=Kind.Integer, description=""), + ) +} diff --git a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala index adf4de985..3ce07f30e 100644 --- a/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala +++ b/src/main/scala/com/fulcrumgenomics/vcf/api/VcfSource.scala @@ -40,11 +40,15 @@ import htsjdk.variant.vcf.{VCFCodec, VCFFileReader, VCFHeader} * for iterating over the entire stream of variants as well as querying by genomic location if an * index is present. * - * @param reader the underlying HTSJDK [[VCFFileReader]] + * @param reader the underlying HTSJDK [[VCFFileReader]]. * @param headerTransformer a method to transform the header after being read in. + * @param allowKindMismatch allow a mismatch between the actual kind and the kind defined in the VCF header. + * @param allowExtraFields allow fields not defined in the VCF header. */ class VcfSource private(private val reader: AbstractFeatureReader[VariantContext, _], - private val headerTransformer: VcfHeader => VcfHeader) extends View[Variant] with Closeable { + private val headerTransformer: VcfHeader => VcfHeader, + allowKindMismatch: Boolean = false, + allowExtraFields: Boolean = false) extends View[Variant] with Closeable { /** The header associated with the VCF being read. */ val header: VcfHeader = headerTransformer(VcfConversions.toScalaHeader(reader.getHeader.asInstanceOf[VCFHeader])) @@ -58,7 +62,7 @@ class VcfSource private(private val reader: AbstractFeatureReader[VariantContext /** Wraps an iterator provided by HTSJDK into a SelfClosingIterator that transforms VariantContexts into Variants. */ private def wrap(it: CloseableIterator[VariantContext]): VariantIterator = { new SelfClosingIterator( - iter = it.map(vc => VcfConversions.toScalaVariant(vc, header)), + iter = it.map(vc => VcfConversions.toScalaVariant(vc, header, allowKindMismatch=allowKindMismatch, allowExtraFields=allowExtraFields)), closer = () => it.close()) } @@ -93,7 +97,7 @@ object VcfSource { * Manufactures a variant source for reading from the specified path. The index, if one exists, will be * auto-discovered based on the path to the VCF. * - * @param path the path to a VCF, gzipped VCF or BCF file + * @param path the path to a VCF, gzipped VCF or BCF file * @return a VariantSource for reading from the path given */ def apply(path: PathToVcf): VcfSource = { @@ -106,9 +110,14 @@ object VcfSource { * * @param path the path to a VCF, gzipped VCF or BCF file * @param headerTransformer a method to transform the header after being read in + * @param allowKindMismatch allow a mismatch between the actual kind and the kind defined in the VCF header. + * @param allowExtraFields allow fields not defined in the VCF header. * @return a VariantSource for reading from the path given */ - def apply(path: PathToVcf, headerTransformer: VcfHeader => VcfHeader): VcfSource = { + def apply(path: PathToVcf, + headerTransformer: VcfHeader => VcfHeader, + allowKindMismatch: Boolean = false, + allowExtraFields: Boolean = false): VcfSource = { val codec = if (PathUtil.extensionOf(path).contains(".bcf")) { new BCF2Codec } @@ -119,7 +128,13 @@ object VcfSource { } val reader = AbstractFeatureReader.getFeatureReader(path.toUri.toString, codec, false) - new VcfSource(reader, headerTransformer=headerTransformer) + + new VcfSource( + reader = reader, + headerTransformer = headerTransformer, + allowKindMismatch = allowKindMismatch, + allowExtraFields = allowExtraFields + ) } } diff --git a/src/main/scala/com/fulcrumgenomics/vcf/validation/GenotypeValidator.scala b/src/main/scala/com/fulcrumgenomics/vcf/validation/GenotypeValidator.scala new file mode 100644 index 000000000..270bb04ca --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/vcf/validation/GenotypeValidator.scala @@ -0,0 +1,81 @@ +/* + * The MIT License + * + * Copyright (c) 2020 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.vcf.validation + +import com.fulcrumgenomics.vcf.api.{Genotype, Variant, VcfFormatHeader} +import com.fulcrumgenomics.vcf.api.VcfHeader.ReservedVcfFormatHeaders +import com.fulcrumgenomics.vcf.validation.ValidationResult.error + + +sealed trait GenotypeValidator extends VariantValidator { + final def validate(variant: Variant): Seq[ValidationResult] = { + variant.genotypes.values.flatMap { genotype => validate(variant=variant, genotype=genotype) }.toIndexedSeq + } + def validate(variant: Variant, genotype: Genotype): Seq[ValidationResult] +} + +object GenotypeValidator { + val VariantFormatValidators: Seq[VariantFormatValidator] = ReservedVcfFormatHeaders.map(VariantFormatValidator.apply) + + case class VariantFormatValidator(format: VcfFormatHeader) extends GenotypeValidator { + def validate(variant: Variant, genotype: Genotype): Seq[ValidationResult] = { + genotype.get[Any](format.id).toIndexedSeq.flatMap { value => + this.validate(kind=format.kind, count=format.count, variant=variant, genotype=Some(genotype), source=s"FORMAT.${format.id}", value=value) + } + } + } + + case object PhaseSetGenotypeValidator extends GenotypeValidator { + def validate(variant: Variant, genotype: Genotype): Seq[ValidationResult] = { + (genotype.phased, genotype.get[Any]("PS")) match { + case (false, None) => Seq.empty // OK + case (true, Some(value: Int)) => Seq.empty // OK + case (false, Some(value)) => Seq(error( + s"FORMAT.PS genotype was unphased but had a phase set value `$value`", variant=variant, genotype=genotype + )) + case (true, None) => Seq(error( + s"FORMAT.PS genotype was phased but had no phase set value", variant=variant, genotype=genotype + )) + case (true, Some(value)) => Seq(error( + s"FORMAT.PS genotype was phased but had a non-integer phase set value `${value.getClass.getSimpleName}", variant=variant, genotype=genotype + )) + } + } + } + + case class VariantFormatExtraFieldValidator(formatKeys: Set[String]) extends VariantValidator { + def validate(variant: Variant): Seq[ValidationResult] = { + // check for FORMAT values not described in the header + variant.genotypes + .flatMap { case (_, genotype) => genotype.attrs.keySet } + .toSet + .diff(formatKeys) + .map { id: String => error(s"FORMAT.$id found in record but missing in header", variant=variant) } + .toIndexedSeq + } + } +} + diff --git a/src/main/scala/com/fulcrumgenomics/vcf/validation/ValidationResult.scala b/src/main/scala/com/fulcrumgenomics/vcf/validation/ValidationResult.scala new file mode 100644 index 000000000..adf6b419e --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/vcf/validation/ValidationResult.scala @@ -0,0 +1,79 @@ +/* + * The MIT License + * + * Copyright (c) 2020 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.vcf.validation + +import com.fulcrumgenomics.commons.util.{LogLevel, Logger} +import com.fulcrumgenomics.vcf.api.{Genotype, Variant} + + +case class ValidationResult(message: String, + variant: Option[Variant] = None, + genotype: Option[Genotype] = None, + level: LogLevel = LogLevel.Error) { + def fullMessage: String = { + val builder = new StringBuilder() + variant.foreach(v => builder.append(f"${v.chrom}:${v.pos} ")) + genotype.foreach(g => builder.append(f"${g.sample} ")) + builder.append(this.message) + builder.toString() + } + + def emit(print: String => Unit): Unit = print(this.message) + def emit(logger: Logger): Unit = this._emit(logger) + val _emit: Logger => Unit = this.level match { + case LogLevel.Debug => (logger: Logger) => logger.debug(this.fullMessage) + case LogLevel.Info => (logger: Logger) => logger.info(this.fullMessage) + case LogLevel.Warning => (logger: Logger) => logger.warning(this.fullMessage) + case LogLevel.Error => (logger: Logger) => logger.error(this.fullMessage) + case LogLevel.Fatal => (logger: Logger) => logger.fatal(this.fullMessage) + } +} + +object ValidationResult { + def debug(message: String, variant: Option[Variant] = None, genotype: Option[Genotype] = None): ValidationResult = { + ValidationResult(variant=variant, message=message, genotype=genotype, level=LogLevel.Debug) + } + def debug(message: String, variant: Variant): ValidationResult = debug(message=message, variant=Some(variant)) + def debug(message: String, variant: Variant, genotype: Genotype): ValidationResult = debug(message=message, variant=Some(variant), genotype=Some(genotype)) + + def info(message: String, variant: Option[Variant] = None, genotype: Option[Genotype] = None): ValidationResult = { + ValidationResult(variant=variant, message=message, genotype=genotype, level=LogLevel.Info) + } + def info(message: String, variant: Variant): ValidationResult = info(message=message, variant=Some(variant)) + def info(message: String, variant: Variant, genotype: Genotype): ValidationResult = info(message=message, variant=Some(variant), genotype=Some(genotype)) + + def warning(message: String, variant: Option[Variant] = None, genotype: Option[Genotype] = None): ValidationResult = { + ValidationResult(variant=variant, message=message, genotype=genotype, level=LogLevel.Warning) + } + def warning(message: String, variant: Variant): ValidationResult = warning(message=message, variant=Some(variant)) + def warning(message: String, variant: Variant, genotype: Genotype): ValidationResult = warning(message=message, variant=Some(variant), genotype=Some(genotype)) + + def error(message: String, variant: Option[Variant] = None, genotype: Option[Genotype] = None): ValidationResult = { + ValidationResult(variant=variant, message=message, genotype=genotype, level=LogLevel.Error) + } + def error(message: String, variant: Variant): ValidationResult = error(message=message, variant=Some(variant)) + def error(message: String, variant: Variant, genotype: Genotype): ValidationResult = error(message=message, variant=Some(variant), genotype=Some(genotype)) +} \ No newline at end of file diff --git a/src/main/scala/com/fulcrumgenomics/vcf/validation/VariantValidator.scala b/src/main/scala/com/fulcrumgenomics/vcf/validation/VariantValidator.scala new file mode 100644 index 000000000..abc6d8616 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/vcf/validation/VariantValidator.scala @@ -0,0 +1,134 @@ +/* + * The MIT License + * + * Copyright (c) 2020 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.vcf.validation + +import com.fulcrumgenomics.vcf.api.VcfHeader._ +import com.fulcrumgenomics.vcf.api._ +import com.fulcrumgenomics.vcf.validation.ValidationResult._ + +import scala.collection.immutable.ArraySeq +import scala.collection.mutable.ListBuffer + +trait VariantValidator extends VcfValidator { + import Variant.{FlagValue, Missing, MissingChar, MissingFloat, MissingInt} + + def validate(variant: Variant): Seq[ValidationResult] + + private def toKind(value: Any): Seq[VcfFieldType] = value match { + case arr: ArrayAttr[_] => arr.flatMap(toKind) + case arr: ArraySeq[_] => arr.flatMap(toKind) + case MissingInt => Seq(VcfFieldType.Integer) + case _ if value.isInstanceOf[Int] => Seq(VcfFieldType.Integer) + case MissingFloat => Seq(VcfFieldType.Float) + case _ if value.isInstanceOf[Float] => Seq(VcfFieldType.Float) + case Missing => Seq(VcfFieldType.String) + case _ if value.isInstanceOf[String] => Seq(VcfFieldType.String) + case MissingChar => Seq(VcfFieldType.Character) + case _ if value.isInstanceOf[Char] => Seq(VcfFieldType.Character) + case FlagValue => Seq(VcfFieldType.Flag) // Note: could also be VcfFieldType.Fixed(0) + } + + protected def validate(kind: VcfFieldType, + count: VcfCount, + variant: Variant, + source: String, + value: Any, + genotype: Option[Genotype] = None): Seq[ValidationResult] = { + val builder = new ListBuffer[ValidationResult]() + val actualCount: Int = value match { + case _ if value == Variant.FlagValue => 0 + case arr: ArrayAttr[_] => arr.length + case arr: ArraySeq[_] => arr.length + case seq: Seq[_] => seq.length // should not happen, but for good measure + case _ => 1 + } + val expectedCount: Option[Int] = count match { + case VcfCount.OnePerAltAllele => Some(variant.alleles.alts.length) + case VcfCount.OnePerAllele => Some(variant.alleles.size) + case VcfCount.OnePerGenotype => Some(1) + case VcfCount.Unknown => None + case VcfCount.Fixed(n) => Some(n) + } + if (expectedCount.exists(_ != actualCount)) { + val _expectedCount = expectedCount.getOrElse(0) + builder.append(error( + s"$source expected `${_expectedCount}` values, found `$actualCount`", variant=Some(variant), genotype=genotype + )) + } + + val actualKind: Seq[VcfFieldType] = toKind(value=value) + val kindOk: Boolean = { + if (actualCount == 0 && actualKind.forall(_ == VcfFieldType.Flag)) { + // can also be VcfFieldType.Fixed(0) + kind == VcfFieldType.Flag || count == VcfCount.Fixed(0) + } + else { + actualKind.forall(_ == kind) + } + } + if (!kindOk) { + val _actualKind = actualKind match { + case Seq(_kind) => _kind + case kinds => kinds.distinct.mkString(",") + } + builder.append(error( + s"$source expected `$kind` kind, found `${_actualKind}`", variant=Some(variant), genotype=genotype + )) + } + + if (builder.isEmpty) IndexedSeq.empty[ValidationResult] else builder.toIndexedSeq + } +} + + + +object VariantValidator { + val VariantInfoValidators : Seq[VariantInfoValidator] = ReservedVcfInfoHeaders.map(VariantInfoValidator.apply) + + case class VariantInfoValidator(info: VcfInfoHeader) extends VariantValidator { + def validate(variant: Variant): Seq[ValidationResult] = { + variant.get[Any](info.id).toIndexedSeq.flatMap { value => + this.validate(kind=info.kind, count=info.count, variant=variant, source=f"INFO.${info.id}", value=value) + } + } + } + + case class VariantInfoExtraFieldValidator(infoKeys: Set[String]) extends VariantValidator { + def validate(variant: Variant): Seq[ValidationResult] = { + variant.attrs.keys + .filterNot(infoKeys.contains) + .map { id => + error(s"INFO.$id found in record but missing in header", variant = variant) + }.toIndexedSeq + } + } +} + +// TODO: validators across variants (ex. spanning alleles), or across genotypes (phase set) +sealed trait VariantsValidator { + def validate(variant: Variant): Seq[ValidationResult] + def finish(): Seq[ValidationResult] +} \ No newline at end of file diff --git a/src/main/scala/com/fulcrumgenomics/vcf/validation/VcfHeaderValidator.scala b/src/main/scala/com/fulcrumgenomics/vcf/validation/VcfHeaderValidator.scala new file mode 100644 index 000000000..ec6e9f7d4 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/vcf/validation/VcfHeaderValidator.scala @@ -0,0 +1,127 @@ +/* + * The MIT License + * + * Copyright (c) 2020 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.vcf.validation + +import com.fulcrumgenomics.vcf.api._ +import com.fulcrumgenomics.vcf.validation.ValidationResult._ + +import scala.collection.mutable.ListBuffer + +sealed trait VcfHeaderValidator extends VcfValidator { + def validate(header: VcfHeader): Seq[ValidationResult] +} + +object VcfHeaderValidator { + val Validators: Seq[VcfHeaderValidator] = Seq( + new VcfHeaderHasContigsValidator, + new VcfHeaderUniqueContigsValidator, + new VcfHeaderUniqueInfosValidator, + new VcfHeaderUniqueFormatsValidator + ) + + class VcfHeaderHasContigsValidator extends VcfHeaderValidator { + def validate(header: VcfHeader): Seq[ValidationResult] = { + if (header.contigs.nonEmpty) Seq.empty + else Seq(warning(message="No contig lines in the header.")) + } + } + + sealed trait UniqueValuesValidator[T<:VcfHeaderEntry] extends VcfHeaderValidator { + def toEntries(header: VcfHeader): Seq[T] + def toKey(entry: T): String + def name: String + def validate(header: VcfHeader): Seq[ValidationResult] = { + toEntries(header=header) + .groupBy(toKey) + .filter(_._2.length > 2) + .map { case (key, values) => + error(s"Found ${this.name} name ${values.length} times: `$key`") + }.toIndexedSeq + } + } + + class VcfHeaderUniqueContigsValidator extends UniqueValuesValidator[VcfContigHeader] { + def toEntries(header: VcfHeader): Seq[VcfContigHeader] = header.contigs + def toKey(entry: VcfContigHeader): String = entry.name + def name: String = "contig" + } + + class VcfHeaderUniqueInfosValidator extends UniqueValuesValidator[VcfInfoHeader] { + def toEntries(header: VcfHeader): Seq[VcfInfoHeader] = header.infos + def toKey(entry: VcfInfoHeader): String = entry.id + def name: String = "INFO.ID" + } + + class VcfHeaderUniqueFormatsValidator extends UniqueValuesValidator[VcfFormatHeader] { + def toEntries(header: VcfHeader): Seq[VcfFormatHeader] = header.formats + def toKey(entry: VcfFormatHeader): String = entry.id + def name: String = "FORMAT.ID" + } +} + +sealed trait VcfHeaderEntryValidator extends VcfValidator { + def validate(entry: VcfHeaderEntry): Seq[ValidationResult] +} + +object VcfHeaderEntryValidator { + + val ReservedVcfHeaderEntryValidators: Seq[VcfHeaderEntryValidator] = { + VcfHeader.ReservedVcfInfoHeaders.map(VcfInfoHeaderValidator.apply) ++ VcfHeader.ReservedVcfFormatHeaders.map(VcfFormatHeaderValidator.apply) + } + + trait VcfInfoOrFormatHeaderValidator extends VcfHeaderEntryValidator { + def base: VcfHeaderInfoOrFormatEntry + protected def errorPrefix: String + + def validate(entry: VcfHeaderEntry): Seq[ValidationResult] = entry match { + case _entry: VcfHeaderInfoOrFormatEntry if _entry.id == base.id => + val builder = new ListBuffer[ValidationResult]() + // Count + if (base.count != _entry.count) { + builder.append(error( + f"$errorPrefix: expected count `${base.count}` found `${_entry.count}`" + )) + } + // Kind + if (base.kind != _entry.kind) { + builder.append(error( + f"$errorPrefix: expected type `${base.kind}` found `${_entry.kind}`" + )) + } + if (builder.isEmpty) IndexedSeq.empty else builder.toIndexedSeq + case _ => Seq.empty + } + } + + case class VcfInfoHeaderValidator(base: VcfInfoHeader) extends VcfInfoOrFormatHeaderValidator { + protected val errorPrefix: String = f"Header INFO.${base.id}" + } + + case class VcfFormatHeaderValidator(base: VcfFormatHeader) extends VcfInfoOrFormatHeaderValidator { + protected val errorPrefix: String = f"Header FORMAT.${base.id}" + } +} + diff --git a/src/main/scala/com/fulcrumgenomics/vcf/validation/VcfValidator.scala b/src/main/scala/com/fulcrumgenomics/vcf/validation/VcfValidator.scala new file mode 100644 index 000000000..8e681e6d5 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/vcf/validation/VcfValidator.scala @@ -0,0 +1,28 @@ +/* + * The MIT License + * + * Copyright (c) 2020 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + +package com.fulcrumgenomics.vcf.validation + +trait VcfValidator