Skip to content

Commit

Permalink
Merge branch 'master' of github.com:cyenyxe/vcf-validator
Browse files Browse the repository at this point in the history
  • Loading branch information
Cristina Yenyxe Gonzalez Garcia committed Jun 16, 2015
2 parents 3cf61da + 19e1791 commit 4346af2
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 35 deletions.
36 changes: 25 additions & 11 deletions inc/vcf/file_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,25 @@ namespace opencb

typedef std::multimap<std::string, MetaEntry>::iterator meta_iterator;

/**
 * Flags identifying an input format.
 *
 * Each enumerator occupies its own bit (0x01, 0x02, 0x04, 0x08, 0x10), so all
 * the values are pairwise distinct powers of two.
 */
enum InputFormat
{
    VCF_FILE_VCF   = 1 << 0,
    VCF_FILE_GVCF  = 1 << 1,
    VCF_FILE_GZIP  = 1 << 2,
    VCF_FILE_BGZIP = 1 << 3,
    VCF_FILE_BCF   = 1 << 4
};

/**
 * Classification of a single alternate allele of a record.
 *
 * Record::set_types() stores one of these values per alternate allele, testing
 * the conditions below in this order: NO_VARIATION, STRUCTURAL,
 * STRUCTURAL_BREAKEND, INDEL, and finally SNV/MNV.
 */
enum class RecordType
{
// Alternate has the same length as the reference allele, and that length is 1
SNV,
// Alternate has the same length as the reference allele, and that length is not 1
MNV,
// Alternate and reference allele have different lengths
INDEL,
// Alternate starts with '<'
STRUCTURAL,
// Alternate contains exactly two '[' or exactly two ']' characters
STRUCTURAL_BREAKEND,
// Alternate is the single dot "."
NO_VARIATION
};


struct MetaEntry
{
Expand Down Expand Up @@ -59,15 +78,6 @@ namespace opencb
void check_value();
};

/**
 * Flags identifying an input format. Each enumerator is a distinct single-bit
 * value (0x01 up to 0x10).
 */
enum InputFormat
{
VCF_FILE_VCF = 0x01,
VCF_FILE_GVCF = 0x02,
VCF_FILE_GZIP = 0x04,
VCF_FILE_BGZIP = 0x08,
VCF_FILE_BCF = 0x10,
};

struct Source
{
std::string name; /**< Name of the source to interact with (file, stdin...) */
Expand All @@ -92,7 +102,8 @@ namespace opencb

std::string reference_allele;
std::vector<std::string> alternate_alleles;

std::vector<RecordType> types;

float quality;
std::vector<std::string> filters;
std::map<std::string, std::string> info;
Expand Down Expand Up @@ -120,6 +131,9 @@ namespace opencb
bool operator!=(Record const &) const;

private:

void set_types();

/**
* Checks that chromosome does not contain colons or white-spaces
*
Expand Down Expand Up @@ -149,7 +163,7 @@ namespace opencb
*
* @throw std::invalid_argument
*/
void check_alternate_allele_structure(std::string const & alternate) const;
void check_alternate_allele_structure(std::string const & alternate, RecordType type) const;

/**
* Checks that alternates of the form <SOME_ALT_ID> are described in the meta section
Expand Down
2 changes: 2 additions & 0 deletions inc/vcf/validator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ namespace opencb

private:
void check_body_entry_ploidy(ParsingState & state, Record & record);
void check_body_entry_reference_alternate_matching(ParsingState & state, Record & record);

void check_contig_meta(ParsingState & state, Record & record) const;
void check_alternate_allele_meta(ParsingState & state, Record & record) const;
void check_filter_meta(ParsingState & state, Record & record) const;
Expand Down
66 changes: 48 additions & 18 deletions src/vcf/record.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,15 @@ namespace opencb
ids{ids},
reference_allele{reference_allele},
alternate_alleles{alternate_alleles},
types{},
quality{quality},
filters{filters},
info{info},
format{format},
samples{samples},
source{source}
{
set_types();
check_chromosome();
check_ids();
check_alternate_alleles();
Expand Down Expand Up @@ -58,6 +60,25 @@ namespace opencb
return !(*this == other);
}

/**
 * Classifies every alternate allele and stores the result in `types`, so that
 * types[i] describes alternate_alleles[i].
 *
 * Rules, applied in this order to each alternate:
 *  - it is "."                                   -> NO_VARIATION
 *  - it starts with '<'                          -> STRUCTURAL
 *  - it has exactly two '[' or exactly two ']'   -> STRUCTURAL_BREAKEND
 *  - its length differs from the reference's     -> INDEL
 *  - otherwise (same length as the reference)    -> SNV if the length is 1, MNV if not
 */
void Record::set_types()
{
    // Rebuild from scratch so that a repeated call can not leave duplicated entries
    types.clear();
    types.reserve(alternate_alleles.size());

    // Range-for avoids comparing a signed index against the unsigned size()
    for (auto const & alternate : alternate_alleles) {
        if (alternate == ".") {
            types.push_back(RecordType::NO_VARIATION);
        } else if (alternate[0] == '<') {
            types.push_back(RecordType::STRUCTURAL);
        } else if (std::count(alternate.begin(), alternate.end(), '[') == 2 ||
                   std::count(alternate.begin(), alternate.end(), ']') == 2) {
            types.push_back(RecordType::STRUCTURAL_BREAKEND);
        } else if (alternate.size() != reference_allele.size()) {
            types.push_back(RecordType::INDEL);
        } else {
            types.push_back(alternate.size() == 1 ? RecordType::SNV : RecordType::MNV);
        }
    }
}

void Record::check_chromosome() const
{
if (chromosome.find(':') != std::string::npos) {
Expand All @@ -75,7 +96,7 @@ namespace opencb
}

for (auto & id : ids) {
if (find_if(id.begin(), id.end(), [](char c) { return c == ' ' || c == ';'; }) != id.end()) {
if (std::find_if(id.begin(), id.end(), [](char c) { return c == ' ' || c == ';'; }) != id.end()) {
throw std::invalid_argument("ID must not contain semicolons or whitespaces");
}
}
Expand All @@ -86,12 +107,15 @@ namespace opencb
static boost::regex square_brackets_regex("<([a-zA-Z0-9:_]+)>");
boost::cmatch pieces_match;

for (auto & alternate : alternate_alleles) {
for (size_t i = 0 ; i < alternate_alleles.size(); ++i) {
auto & alternate = alternate_alleles[i];
auto & type = types[i];

// Check alternate allele structure against the reference
check_alternate_allele_structure(alternate);
check_alternate_allele_structure(alternate, type);

// Check that an alternate of the form <SOME_ALT> begins with DEL, INS, DUP, INV or CNV
if (alternate[0] == '<' && regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
if (alternate[0] == '<' && boost::regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
std::string alt_id = pieces_match[1];
if (!boost::starts_with(alt_id, "DEL") &&
!boost::starts_with(alt_id, "INS") &&
Expand All @@ -105,22 +129,28 @@ namespace opencb

}

void Record::check_alternate_allele_structure(std::string const & alternate) const
void Record::check_alternate_allele_structure(std::string const & alternate, RecordType type) const
{
if (alternate == ".") {
if (alternate_alleles.size() > 1) {
throw std::invalid_argument("The no-alternate alleles symbol (dot) can not be combined with others");
}
} else if (alternate[0] == '<') {
return; // Custom ALTs can't be checked against the reference
} else if (std::count(alternate.begin(), alternate.end(), '[') == 2 ||
std::count(alternate.begin(), alternate.end(), ']') == 2) {
return; // Break-ends can't be checked against the reference
} else if (alternate[0] != reference_allele[0] && alternate.size() != reference_allele.size()) {
throw std::invalid_argument("Reference and alternate alleles must share the first nucleotide");
} else if (alternate == reference_allele) {
throw std::invalid_argument("Reference and alternate alleles must not be the same");
switch (type) {
case RecordType::NO_VARIATION:
if (alternate_alleles.size() > 1) {
throw std::invalid_argument("The no-alternate alleles symbol (dot) can not be combined with others");
}
break;
case RecordType::SNV:
case RecordType::MNV:
if (alternate == reference_allele) {
throw std::invalid_argument("Reference and alternate alleles must not be the same");
}
case RecordType::INDEL:
// Nothing to check
break;
case RecordType::STRUCTURAL:
case RecordType::STRUCTURAL_BREAKEND:
// Custom ALTs (STRUCTURAL) and break-ends (STRUCTURAL_BREAKEND) can't be checked against the reference
break;
}

}

void Record::check_quality() const
Expand Down
17 changes: 16 additions & 1 deletion src/vcf/validate_optional_policy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ namespace opencb
// All samples should have the same ploidy
check_body_entry_ploidy(state, record);

// Reference and alternate alleles in indels should share the first nucleotide
check_body_entry_reference_alternate_matching(state, record);

/*
* Once some meta-data is marked as in/correct there is no need to check it again, so all the following have been
* optimised using a map for correctly defined meta-data and another one for incorrectly defined.
Expand Down Expand Up @@ -79,6 +82,18 @@ namespace opencb
}
}

/**
 * Looks at every alternate allele classified as an indel and compares its
 * first character with the first character of the reference allele.
 *
 * @param state  Parsing state (not read by this check)
 * @param record Record whose alternate alleles are inspected
 *
 * @throw ParsingWarning on the first indel whose first character differs from
 *        the reference allele's
 */
void ValidateOptionalPolicy::check_body_entry_reference_alternate_matching(ParsingState & state, Record & record)
{
    for (size_t idx = 0; idx < record.alternate_alleles.size(); ++idx) {
        // Only indels are subject to this check
        if (record.types[idx] != RecordType::INDEL) {
            continue;
        }

        auto const & allele = record.alternate_alleles[idx];
        bool const same_first_nucleotide = (allele[0] == record.reference_allele[0]);
        if (!same_first_nucleotide) {
            throw ParsingWarning("Reference and alternate alleles do not share the first nucleotide");
        }
    }
}

void ValidateOptionalPolicy::check_contig_meta(ParsingState & state, Record & record) const
{
// The associated 'contig' meta entry should exist (notify only once)
Expand Down Expand Up @@ -107,7 +122,7 @@ namespace opencb

for (auto & alternate : record.alternate_alleles) {
// Check alternate ID is present in meta-entry (only applies to the form <SOME_ALT_ID>)
if (alternate[0] == '<' && regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
if (alternate[0] == '<' && boost::regex_match(alternate.c_str(), pieces_match, square_brackets_regex)) {
std::string alt_id = pieces_match[1];

if (state.is_bad_defined_meta("ALT", alt_id) ||
Expand Down
Loading

0 comments on commit 4346af2

Please sign in to comment.