Skip to content

Commit

Permalink
REF/ALT must share the first nucleotide changed from error to warning…
Browse files Browse the repository at this point in the history
…, record type enum added to support this feature and simplify others
  • Loading branch information
cyenyxe committed Jun 9, 2015
1 parent 6982fc1 commit 19e1791
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 35 deletions.
36 changes: 25 additions & 11 deletions inc/vcf/file_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,25 @@ namespace opencb

typedef std::multimap<std::string, MetaEntry>::iterator meta_iterator;

/**
 * Input file format flags.
 *
 * Every enumerator occupies its own bit (0x01 .. 0x10).
 * NOTE(review): the one-bit-per-value layout suggests the flags are meant to be OR-ed together
 * (e.g. VCF + GZIP) - confirm against the code that consumes them.
 */
enum InputFormat
{
    VCF_FILE_VCF   = 1 << 0,   // 0x01
    VCF_FILE_GVCF  = 1 << 1,   // 0x02
    VCF_FILE_GZIP  = 1 << 2,   // 0x04
    VCF_FILE_BGZIP = 1 << 3,   // 0x08
    VCF_FILE_BCF   = 1 << 4,   // 0x10
};

/**
 * Kind of variation described by one alternate allele of a record.
 *
 * Record::set_types() assigns one value per alternate allele, testing the conditions in this
 * order: NO_VARIATION, STRUCTURAL, STRUCTURAL_BREAKEND, INDEL, then SNV/MNV.
 */
enum class RecordType
{
// Alternate has the same length as the reference and is exactly one character long
SNV,
// Alternate has the same length as the reference and is not exactly one character long
MNV,
// Alternate and reference have different lengths
INDEL,
// Alternate starts with '<' (symbolic form such as <SOME_ALT_ID>)
STRUCTURAL,
// Alternate contains exactly two '[' or exactly two ']' characters
STRUCTURAL_BREAKEND,
// Alternate is the single dot "."
NO_VARIATION
};


struct MetaEntry
{
Expand Down Expand Up @@ -59,15 +78,6 @@ namespace opencb
void check_value();
};

/**
 * Input file format flags.
 *
 * Every enumerator occupies its own bit (0x01 .. 0x10).
 * NOTE(review): the one-bit-per-value layout suggests the flags are meant to be OR-ed together
 * (e.g. VCF + GZIP) - confirm against the code that consumes them.
 */
enum InputFormat
{
    VCF_FILE_VCF   = 1 << 0,   // 0x01
    VCF_FILE_GVCF  = 1 << 1,   // 0x02
    VCF_FILE_GZIP  = 1 << 2,   // 0x04
    VCF_FILE_BGZIP = 1 << 3,   // 0x08
    VCF_FILE_BCF   = 1 << 4,   // 0x10
};

struct Source
{
std::string name; /**< Name of the source to interact with (file, stdin...) */
Expand All @@ -92,7 +102,8 @@ namespace opencb

std::string reference_allele;
std::vector<std::string> alternate_alleles;

std::vector<RecordType> types;

float quality;
std::vector<std::string> filters;
std::map<std::string, std::string> info;
Expand Down Expand Up @@ -120,6 +131,9 @@ namespace opencb
bool operator!=(Record const &) const;

private:

/**
 * Fills `types` with one RecordType per entry of `alternate_alleles`, in the same order,
 * so that types[i] classifies alternate_alleles[i]
 */
void set_types();

/**
* Checks that chromosome does not contain colons or white-spaces
*
Expand Down Expand Up @@ -149,7 +163,7 @@ namespace opencb
*
* @throw std::invalid_argument
*/
void check_alternate_allele_structure(std::string const & alternate) const;
void check_alternate_allele_structure(std::string const & alternate, RecordType type) const;

/**
* Checks that alternates of the form <SOME_ALT_ID> are described in the meta section
Expand Down
2 changes: 2 additions & 0 deletions inc/vcf/validator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ namespace opencb

private:
void check_body_entry_ploidy(ParsingState & state, Record & record);
/**
 * Checks that every alternate allele typed as INDEL starts with the same character
 * as the reference allele
 *
 * @throw ParsingWarning
 */
void check_body_entry_reference_alternate_matching(ParsingState & state, Record & record);

void check_contig_meta(ParsingState & state, Record & record) const;
void check_alternate_allele_meta(ParsingState & state, Record & record) const;
void check_filter_meta(ParsingState & state, Record & record) const;
Expand Down
66 changes: 48 additions & 18 deletions src/vcf/record.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@ namespace opencb
ids{ids},
reference_allele{reference_allele},
alternate_alleles{alternate_alleles},
types{},
quality{quality},
filters{filters},
info{info},
format{format},
samples{samples},
source{source}
{
set_types();
check_chromosome();
check_ids();
check_alternate_alleles();
Expand Down Expand Up @@ -58,6 +60,25 @@ namespace opencb
return !(*this == other);
}

/**
 * Classifies every alternate allele and stores the result in `types`, so that
 * types[i] describes alternate_alleles[i].
 *
 * Rules, applied in this order:
 *  - "."                                   -> NO_VARIATION
 *  - starts with '<'                       -> STRUCTURAL
 *  - exactly two '[' or exactly two ']'    -> STRUCTURAL_BREAKEND
 *  - length differs from the reference     -> INDEL
 *  - same length as the reference          -> SNV if one character long, MNV otherwise
 */
void Record::set_types()
{
    // Rebuild from scratch so `types` always stays index-aligned with `alternate_alleles`
    types.clear();
    types.reserve(alternate_alleles.size());

    // Range-for avoids the signed/unsigned comparison of an int index against size()
    for (auto const & alternate : alternate_alleles) {
        if (alternate == ".") {
            types.push_back(RecordType::NO_VARIATION);
        } else if (alternate[0] == '<') {
            // operator[] at index 0 is safe even for an empty string (yields '\0')
            types.push_back(RecordType::STRUCTURAL);
        } else if (std::count(alternate.begin(), alternate.end(), '[') == 2 ||
                   std::count(alternate.begin(), alternate.end(), ']') == 2) {
            types.push_back(RecordType::STRUCTURAL_BREAKEND);
        } else if (alternate.size() != reference_allele.size()) {
            types.push_back(RecordType::INDEL);
        } else {
            types.push_back(alternate.size() == 1 ? RecordType::SNV : RecordType::MNV);
        }
    }
}

void Record::check_chromosome() const
{
if (chromosome.find(':') != std::string::npos) {
Expand All @@ -75,7 +96,7 @@ namespace opencb
}

for (auto & id : ids) {
if (find_if(id.begin(), id.end(), [](char c) { return c == ' ' || c == ';'; }) != id.end()) {
if (std::find_if(id.begin(), id.end(), [](char c) { return c == ' ' || c == ';'; }) != id.end()) {
throw std::invalid_argument("ID must not contain semicolons or whitespaces");
}
}
Expand All @@ -86,12 +107,15 @@ namespace opencb
static boost::regex square_brackets_regex("<([a-zA-Z0-9:_]+)>");
boost::cmatch pieces_match;

for (auto & alternate : alternate_alleles) {
for (size_t i = 0 ; i < alternate_alleles.size(); ++i) {
auto & alternate = alternate_alleles[i];
auto & type = types[i];

// Check alternate allele structure against the reference
check_alternate_allele_structure(alternate);
check_alternate_allele_structure(alternate, type);

// Check that an alternate of the form <SOME_ALT> begins with DEL, INS, DUP, INV or CNV
if (alternate[0] == '<' && regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
if (alternate[0] == '<' && boost::regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
std::string alt_id = pieces_match[1];
if (!boost::starts_with(alt_id, "DEL") &&
!boost::starts_with(alt_id, "INS") &&
Expand All @@ -105,22 +129,28 @@ namespace opencb

}

void Record::check_alternate_allele_structure(std::string const & alternate) const
void Record::check_alternate_allele_structure(std::string const & alternate, RecordType type) const
{
if (alternate == ".") {
if (alternate_alleles.size() > 1) {
throw std::invalid_argument("The no-alternate alleles symbol (dot) can not be combined with others");
}
} else if (alternate[0] == '<') {
return; // Custom ALTs can't be checked against the reference
} else if (std::count(alternate.begin(), alternate.end(), '[') == 2 ||
std::count(alternate.begin(), alternate.end(), ']') == 2) {
return; // Break-ends can't be checked against the reference
} else if (alternate[0] != reference_allele[0] && alternate.size() != reference_allele.size()) {
throw std::invalid_argument("Reference and alternate alleles must share the first nucleotide");
} else if (alternate == reference_allele) {
throw std::invalid_argument("Reference and alternate alleles must not be the same");
switch (type) {
case RecordType::NO_VARIATION:
if (alternate_alleles.size() > 1) {
throw std::invalid_argument("The no-alternate alleles symbol (dot) can not be combined with others");
}
break;
case RecordType::SNV:
case RecordType::MNV:
if (alternate == reference_allele) {
throw std::invalid_argument("Reference and alternate alleles must not be the same");
}
case RecordType::INDEL:
// Nothing to check
break;
case RecordType::STRUCTURAL:
case RecordType::STRUCTURAL_BREAKEND:
// Custom ALTs (STRUCTURAL) and break-ends (STRUCTURAL_BREAKEND) can't be checked against the reference
break;
}

}

void Record::check_quality() const
Expand Down
17 changes: 16 additions & 1 deletion src/vcf/validate_optional_policy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ namespace opencb
// All samples should have the same ploidy
check_body_entry_ploidy(state, record);

// Reference and alternate alleles in indels should share the first nucleotide
check_body_entry_reference_alternate_matching(state, record);

/*
* Once some meta-data is marked as in/correct there is no need again, so all the following have been
* optimised using a map for correctly defined meta-data and another one for incorrectly defined.
Expand Down Expand Up @@ -79,6 +82,18 @@ namespace opencb
}
}

/**
 * Raises a warning when an alternate allele classified as INDEL does not start with the
 * same character as the reference allele. Alternates of any other type are ignored.
 *
 * @param state Parsing state (not read by this check)
 * @param record Record whose alternate alleles and their types are inspected
 *
 * @throw ParsingWarning
 */
void ValidateOptionalPolicy::check_body_entry_reference_alternate_matching(ParsingState & state, Record & record)
{
    size_t position = 0;

    for (auto & allele : record.alternate_alleles) {
        // types[k] is the classification of alternate_alleles[k]
        auto const allele_type = record.types[position];
        ++position;

        if (allele_type != RecordType::INDEL) {
            continue;
        }

        if (allele[0] != record.reference_allele[0]) {
            throw ParsingWarning("Reference and alternate alleles do not share the first nucleotide");
        }
    }
}

void ValidateOptionalPolicy::check_contig_meta(ParsingState & state, Record & record) const
{
// The associated 'contig' meta entry should exist (notify only once)
Expand Down Expand Up @@ -107,7 +122,7 @@ namespace opencb

for (auto & alternate : record.alternate_alleles) {
// Check alternate ID is present in meta-entry (only applies to the form <SOME_ALT_ID>)
if (alternate[0] == '<' && regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
if (alternate[0] == '<' && boost::regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
std::string alt_id = pieces_match[1];

if (state.is_bad_defined_meta("ALT", alt_id) ||
Expand Down
Loading

0 comments on commit 19e1791

Please sign in to comment.