Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GA4GHTT-276 - minor update #252

Merged
merged 3 commits into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 33 additions & 29 deletions inc/vcf/file_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,53 +168,51 @@ namespace ebi
{ ADR, { INTEGER, R } },
{ AF, { FLOAT, A } },
{ AN, { INTEGER, "1" } },
{ BKPTID, { STRING, UNKNOWN_CARDINALITY } },
{ BKPTID, { STRING, A } },
{ BQ, { FLOAT, "1" } },
{ CICN, { INTEGER, "2" } },
{ CICNADJ, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIEND, { INTEGER, "2" } },
{ CICN, { FLOAT, UNKNOWN_CARDINALITY } },
{ CIEND, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIGAR, { STRING, A } },
{ CILEN, { INTEGER, "2" } },
{ CIPOS, { INTEGER, "2" } },
{ CILEN, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIPOS, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIRB, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIRUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ CN, { FLOAT, "A" } },
{ CNADJ, { INTEGER, UNKNOWN_CARDINALITY } },
{ DB, { FLAG, "0" } },
{ DBRIPID, { STRING, "1" } },
{ DBVARID, { STRING, "1" } },
{ DGVID, { STRING, "1" } },
{ DBRIPID, { STRING, A } },
{ DBVARID, { STRING, A } },
{ DGVID, { STRING, A } },
{ DP, { INTEGER, "1" } },
{ DPADJ, { INTEGER, UNKNOWN_CARDINALITY } },
{ END, { INTEGER, "1" } },
{ EVENT, { STRING, "1" } },
{ HOMLEN, { INTEGER, UNKNOWN_CARDINALITY } },
{ HOMSEQ, { STRING, UNKNOWN_CARDINALITY } },
{ EVENT, { STRING, A } },
{ EVENTTYPE, { STRING, A } },
{ HOMLEN, { INTEGER, A } },
{ HOMSEQ, { STRING, A } },
{ H2, { FLAG, "0" } },
{ H3, { FLAG, "0" } },
{ IMPRECISE, { FLAG, "0" } },
{ MATEID, { STRING, UNKNOWN_CARDINALITY } },
{ MEINFO, { STRING, "4" } },
{ METRANS, { STRING, "4" } },
{ MATEID, { STRING, A } },
{ MEINFO, { STRING, UNKNOWN_CARDINALITY } },
{ METRANS, { STRING, UNKNOWN_CARDINALITY } },
{ MQ, { MISSING_VALUE, "1" } },
{ MQ0, { INTEGER, "1" } },
{ NOVEL, { FLAG, "0" } },
{ NS, { INTEGER, "1" } },
{ PARID, { STRING, "1" } },
{ PARID, { STRING, A } },
// TODO : SB metadata Type and Number is "."
{ RB, { INTEGER, UNKNOWN_CARDINALITY } },
{ RN, { INTEGER, "A" } },
{ RUB, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ RUL, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUS, { STRING, UNKNOWN_CARDINALITY } },
{ SOMATIC, { FLAG, "0" } },
{ SVCLAIM, { STRING, "A" } },
{ SVLEN, { INTEGER, "A" } },
{ SVTYPE, { STRING, "1" } },
{ VALIDATED, { FLAG, "0" } },
{ THOUSAND_G, { FLAG, "0" } },
{ SVCLAIM, { STRING, "A" } },
{ RN, { INTEGER, "A" } },
{ RUS, { STRING, UNKNOWN_CARDINALITY } },
{ RUL, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ RB, { INTEGER, UNKNOWN_CARDINALITY } },
{ CIRUC, { FLOAT, UNKNOWN_CARDINALITY } },
{ CIRB, { INTEGER, UNKNOWN_CARDINALITY } },
{ RUB, { INTEGER, UNKNOWN_CARDINALITY } }
};
};

const std::map<std::string, std::pair<std::string, std::string>> format_v41_v42 = {
{ AHAP, { INTEGER, "1" } },
Expand Down Expand Up @@ -727,9 +725,15 @@ namespace ebi
void check_info_have_mandatory() const;

/**
* gets total RN count
* Gets total RN count
*/
int getRNvalue() const;

/**
* Checks field cardinality, used in addition to check_info_field_cardinality for explicit checks
*
* @param values values of the INFO field being checked
* @param expected exact number of values the field must have
* @param field INFO key, used only to build the error message
* @throw InfoBodyError (thrown by pointer) when values.size() differs from expected
*/
int check_info_field_cardinality_explicit(std::vector<std::string> const & values, size_t expected,
const std::string field) const;
};

std::ostream &operator<<(std::ostream &os, const Record &record);
Expand Down
19 changes: 10 additions & 9 deletions inc/vcf/string_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ namespace ebi
const std::string CIGAR = "CIGAR";
const std::string CILEN = "CILEN";
const std::string CIPOS = "CIPOS";
const std::string CIRB = "CIRB";
const std::string CIRUC = "CIRUC";
const std::string CN = "CN";
const std::string CNADJ = "CNADJ";
const std::string DB = "DB";
Expand All @@ -144,6 +146,7 @@ namespace ebi
const std::string DPADJ = "DPADJ";
const std::string END = "END";
const std::string EVENT = "EVENT";
const std::string EVENTTYPE = "EVENTTYPE";
const std::string HOMLEN = "HOMLEN";
const std::string HOMSEQ = "HOMSEQ";
const std::string H2 = "H2";
Expand All @@ -157,21 +160,19 @@ namespace ebi
const std::string NOVEL = "NOVEL";
const std::string NS = "NS";
const std::string PARID = "PARID";
const std::string RB = "RB";
const std::string RN = "RN";
const std::string RUC = "RUC";
const std::string RUL = "RUL";
const std::string RUB = "RUB";
const std::string RUS = "RUS";
const std::string SB = "SB";
const std::string SOMATIC = "SOMATIC";
const std::string SVCLAIM = "SVCLAIM";
const std::string SVLEN = "SVLEN";
const std::string SVTYPE = "SVTYPE";
const std::string VALIDATED = "VALIDATED";
const std::string THOUSAND_G = "1000G";
const std::string SVCLAIM = "SVCLAIM";
const std::string RN = "RN";
const std::string RUS = "RUS";
const std::string RUL = "RUL";
const std::string RUC = "RUC";
const std::string RB = "RB";
const std::string CIRUC = "CIRUC";
const std::string CIRB = "CIRB";
const std::string RUB = "RUB";

// FORMAT predefined tags
const std::string AHAP = "AHAP";
Expand Down
12 changes: 6 additions & 6 deletions inc/vcf/validator_detail_v44.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
*/


#line 263 "src/vcf/vcf_v44.ragel"
#line 268 "src/vcf/vcf_v44.ragel"


namespace
Expand Down Expand Up @@ -185,7 +185,7 @@ static const int vcf_v44_en_meta_section_skip = 667;
static const int vcf_v44_en_body_section_skip = 668;


#line 269 "src/vcf/vcf_v44.ragel"
#line 274 "src/vcf/vcf_v44.ragel"

}

Expand All @@ -204,7 +204,7 @@ namespace ebi
cs = vcf_v44_start;
}

#line 283 "src/vcf/vcf_v44.ragel"
#line 288 "src/vcf/vcf_v44.ragel"

}

Expand Down Expand Up @@ -8133,11 +8133,11 @@ case 681:
}
break;
case 75:
#line 261 "src/vcf/vcf_v44.ragel"
#line 266 "src/vcf/vcf_v44.ragel"
{ {cs = 28;goto _again;} }
break;
case 76:
#line 262 "src/vcf/vcf_v44.ragel"
#line 267 "src/vcf/vcf_v44.ragel"
{ {cs = 673;goto _again;} }
break;
#line 8144 "inc/vcf/validator_detail_v44.hpp"
Expand Down Expand Up @@ -8632,7 +8632,7 @@ goto _again;}
_out: {}
}

#line 291 "src/vcf/vcf_v44.ragel"
#line 296 "src/vcf/vcf_v44.ragel"

}

Expand Down
92 changes: 61 additions & 31 deletions src/vcf/record.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,7 @@ namespace ebi
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"};
}
//RUL - RUS matching check made below with RUL
}
else if (field_key == RUL) { //repeat unit length
} else if (field_key == RUL) { //repeat unit length
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
Expand Down Expand Up @@ -563,8 +562,7 @@ namespace ebi
}
}
}
}
else if (field_key == RUC) { //repeat unit count
} else if (field_key == RUC) { //repeat unit count
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
Expand All @@ -574,8 +572,7 @@ namespace ebi
message << "INFO " << RUC << " for record at " << line << " must have " << rnCount << " value(s)";
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"};
}
}
else if (field_key == RB) { //repeat bases
} else if (field_key == RB) { //repeat bases
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
Expand All @@ -585,20 +582,16 @@ namespace ebi
message << "INFO " << RB << " for record at " << line << " must have " << rnCount << " value(s)";
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"};
}
}
else if (field_key == CIRUC) { //conf.interval repeat unit count
} else if (field_key == CIRUC) { //conf.interval repeat unit count
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
auto it = info.find(RUC);
if (it != info.end()) {
std::vector<std::string> RUCval;
util::string_split(it->second, ",", RUCval);
if (values.size() != 2 * RUCval.size()) { //ciruc count must be 2 * RUC count
std::stringstream message;
message << "INFO " << CIRUC << " for record at " << line << " must have " << 2 * RUCval.size() << " value(s)";
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"};
}
//ciruc count must be 2 * RUC count
check_info_field_cardinality_explicit(values, 2 * RUCval.size(), CIRUC);
for (int i = 0; i < values.size(); ++i) {
if (RUCval[i / 2] == MISSING_VALUE) {
if (values[i] != MISSING_VALUE) { //ciruc must be missing with ruc missing
Expand All @@ -614,20 +607,16 @@ namespace ebi
message << "INFO " << CIRUC << " at " << line << " can not have values without " << RUC;
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + "value(s)"};
}
}
else if (field_key == CIRB) {
} else if (field_key == CIRB) {
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
auto it = info.find(RB);
if (it != info.end()) {
std::vector<std::string> RBval;
util::string_split(it->second, ",", RBval);
if (values.size() != 2 * RBval.size()) { //cirb count must be 2 * RB count
std::stringstream message;
message << "INFO " << CIRB << " for record at " << line << " must have " << 2 * RBval.size() << " value(s)";
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"};
}
//cirb count must be 2 * RB count
check_info_field_cardinality_explicit(values, 2 * RBval.size(), CIRB);
for (int i = 0; i < values.size(); ++i) {
if (RBval[i / 2] == MISSING_VALUE) {
if (values[i] != MISSING_VALUE) { //cirb must be missing with RB missing
Expand All @@ -643,8 +632,7 @@ namespace ebi
message << "INFO " << CIRB << " at " << line << " can not have values without " << RB;
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + "value(s)"};
}
}
else if (field_key == RUB) {
} else if (field_key == RUB) {
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
Expand All @@ -667,18 +655,51 @@ namespace ebi
}
cnt += std::stoi(RUCval[i]);
}
if (cnt != values.size()) { //RUB size must be sum(RUC[i])
std::stringstream message;
message << "INFO " << RUB << " for record at " << line << " must have " << cnt << " value(s)";
throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"};
}
//RUB size must be sum(RUC[i])
check_info_field_cardinality_explicit(values, cnt, RUB);
}
else {
//must be present
std::stringstream message;
message << "INFO " << RUB << " for record at " << line << " must have " + RUC;
throw new InfoBodyError{line, message.str()};
}
} else if (field_key == MEINFO) {
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
//MEINFO must be 4 * ALT allele count
check_info_field_cardinality_explicit(values, 4 * alternate_alleles.size(), MEINFO);
} else if (field_key == METRANS) {
if (source->version < Version::v44) { //not applicable for anything < v4.4
return;
}
//METRANS must be 4 * ALT allele count
check_info_field_cardinality_explicit(values, 4 * alternate_alleles.size(), METRANS);
} else if (field_key == CICN) {
if (source->version < Version::v44) { //fixed size and already checked when < v44
return;
}
//CICN must be 2 * ALT allele count
check_info_field_cardinality_explicit(values, 2 * alternate_alleles.size(), CICN);
} else if (field_key == CIPOS) {
if (source->version < Version::v44) { //fixed size and already checked when < v44
return;
}
//CIPOS must be 2 * ALT allele count
check_info_field_cardinality_explicit(values, 2 * alternate_alleles.size(), CIPOS);
} else if (field_key == CIEND) {
if (source->version < Version::v44) { //fixed size and already checked when < v44
return;
}
//CIEND must be 2 * ALT allele count
check_info_field_cardinality_explicit(values, 2 * alternate_alleles.size(), CIEND);
} else if (field_key == CILEN) {
if (source->version < Version::v44) { //fixed size and already checked when < v44
return;
}
//CILEN must be 2 * ALT allele count
check_info_field_cardinality_explicit(values, 2 * alternate_alleles.size(), CILEN);
}
}

Expand Down Expand Up @@ -955,8 +976,8 @@ namespace ebi
+ " is not one of [A, R, G, ., <non-negative number>]"));
}
if(!values.empty()) {
if (values.front() == MISSING_VALUE) { return; } // No need to check missing data
} //TODO, if the 1st one is . then check stops; svclaim=.,DJ worked!
if (values.front() == MISSING_VALUE && values.size() == 1) { return; } // No need to check missing data
}

bool number_matches = true;
if (expected_cardinality > 0) {
Expand Down Expand Up @@ -1041,8 +1062,8 @@ namespace ebi
}

void Record::check_field_integer_range(std::string const & field, std::vector<std::string> const & values) const {
if (field == SVLEN || field == CIPOS || field == CIEND || field == CILEN || field == CICN || field == CICNADJ ||
field == CIRB) {
if (field == SVLEN || field == CIPOS || field == CIEND || field == CILEN || field == CIRB ||
(field == CICNADJ && source->version < Version::v44)) {
// to ignore predefined tag fields which permit negative integral values
return;
}
Expand Down Expand Up @@ -1077,6 +1098,15 @@ namespace ebi
return rnCnt;
}

/**
 * Verifies that an INFO field carries exactly the expected number of values.
 *
 * @param values   values of the INFO field being checked
 * @param expected exact number of values the field must have
 * @param field    INFO key, used only to build the error message
 * @return 0 when the number of values matches
 * @throw InfoBodyError (thrown by pointer) when values.size() differs from expected
 */
int Record::check_info_field_cardinality_explicit(std::vector<std::string> const & values, size_t expected,
                                                  const std::string field) const {
    if (values.size() != expected) {
        std::stringstream message;
        message << "INFO " << field << " for record at " << line << " must have " << expected << " value(s)";
        throw new InfoBodyError{line, message.str(), "Found " + std::to_string(values.size()) + " value(s)"};
    }
    // The function is declared to return int; falling off the end of a
    // value-returning function is undefined behaviour, so report success explicitly.
    return 0;
}

bool is_record_subfield_in_header(std::string const & field_value,
std::multimap<std::string, MetaEntry>::iterator begin,
std::multimap<std::string, MetaEntry>::iterator end)
Expand Down
6 changes: 5 additions & 1 deletion src/vcf/validate_optional_policy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,11 @@ namespace ebi

void ValidateOptionalPolicy::check_body_entry_info_confidence_interval(ParsingState & state, Record const & record) const
{
std::vector<std::string> confidence_interval_tags = { CICN, CICNADJ, CIEND, CILEN, CIPOS, CIRB, CIRUC };
std::vector<std::string> confidence_interval_tags = { CICN, CIEND, CILEN, CIPOS, CIRB, CIRUC };
if (record.source->version < vcf::Version::v44) {
std::vector<std::string> confidence_interval_tags_v43 = { CICN, CICNADJ, CIEND, CILEN, CIPOS, CIRB, CIRUC };
confidence_interval_tags = confidence_interval_tags_v43;
}
for (auto & confidence_interval_tag : confidence_interval_tags) {
auto it = record.info.find(confidence_interval_tag);
if (it != record.info.end()) {
Expand Down
Loading