Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GA4GHTT-261 - 44 1 #242

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions inc/vcf/file_structure.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ namespace ebi
{ CIGAR, { STRING, A } },
{ CILEN, { INTEGER, "2" } },
{ CIPOS, { INTEGER, "2" } },
{ CN, { INTEGER, "1" } },
{ CN, { FLOAT, "A" } },
{ CNADJ, { INTEGER, UNKNOWN_CARDINALITY } },
{ DB, { FLAG, "0" } },
{ DBRIPID, { STRING, "1" } },
Expand All @@ -201,10 +201,11 @@ namespace ebi
{ PARID, { STRING, "1" } },
// TODO : SB metadata Type and Number is "."
{ SOMATIC, { FLAG, "0" } },
{ SVLEN, { INTEGER, UNKNOWN_CARDINALITY } },
{ SVLEN, { INTEGER, "A" } },
{ SVTYPE, { STRING, "1" } },
{ VALIDATED, { FLAG, "0" } },
{ THOUSAND_G, { FLAG, "0" } }
{ THOUSAND_G, { FLAG, "0" } },
{ SVCLAIM, { STRING, "A" } }
};

const std::map<std::string, std::pair<std::string, std::string>> format_v41_v42 = {
Expand Down Expand Up @@ -259,7 +260,7 @@ namespace ebi
{ ADF, { INTEGER, R } },
{ ADR, { INTEGER, R } },
{ AHAP, { INTEGER, "1" } },
{ CN, { INTEGER, "1" } },
{ CN, { FLOAT, "1" } },
{ CNL, { FLOAT, G } },
{ CNP, { FLOAT, G } },
{ CNQ, { FLOAT, "1" } },
Expand Down Expand Up @@ -288,6 +289,16 @@ namespace ebi
BND,
};

const std::map<std::string, const std::set<std::string>> PREDEFINED_INFO_ALLELE_SVCLAIM{
{ DEL, { D, J, DJ } },
{ DUP, { D, J, DJ } },
{ CNV, { D, MISSING_VALUE } },
{ INV, { J, DJ, MISSING_VALUE } },
{ INS, { J, DJ, MISSING_VALUE } },
{ BND, { J, MISSING_VALUE } },
{ MISSING_VALUE, { MISSING_VALUE } }
};

inline std::string const &get_predefined_type(
std::map<std::string, std::pair<std::string, std::string>>::const_iterator const &predefined_tag)
{
Expand Down Expand Up @@ -698,6 +709,13 @@ namespace ebi
* @throw std::invalid_argument
*/
void check_field_integer_range(std::string const & field, std::vector<std::string> const & value) const;

/**
* Checks that INFO contains required mandatory fields when applicable
*
* @throw InfoBodyError
*/
void check_info_have_mandatory() const;
};

std::ostream &operator<<(std::ostream &os, const Record &record);
Expand Down
6 changes: 6 additions & 0 deletions inc/vcf/string_constants.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ namespace ebi
const std::string SVTYPE = "SVTYPE";
const std::string VALIDATED = "VALIDATED";
const std::string THOUSAND_G = "1000G";
const std::string SVCLAIM = "SVCLAIM";

// FORMAT predefined tags
const std::string AHAP = "AHAP";
Expand Down Expand Up @@ -210,6 +211,11 @@ namespace ebi

// ENA REST API for remote fasta access
const std::string ENA_API_FASTA_URL = "https://www.ebi.ac.uk/ena/browser/api/fasta/";

//SVCLAIM types
const std::string D = "D";
const std::string J = "J";
const std::string DJ = "DJ";
}
}

Expand Down
84 changes: 78 additions & 6 deletions src/vcf/record.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ namespace ebi
void Record::check_info() const
{
check_info_no_duplicates();
check_info_have_mandatory();

typedef std::multimap<std::string, MetaEntry>::iterator iter;
std::pair<iter, iter> range = source->meta_entries.equal_range(INFO);
Expand Down Expand Up @@ -306,6 +307,34 @@ namespace ebi
}
}

void Record::check_info_have_mandatory() const
{
static boost::regex deldup_regex("(<(DUP|DEL)(:[^>]+)*>)+");
//boost::cmatch pieces_match;

if (source->version == Version::v44) { //not applicable for 4.1/2/3
for (size_t i = 0; i < alternate_alleles.size(); ++i ) {
//SVLEN must for all SV
auto & alternate = alternate_alleles[i];
if (types[i] == RecordType::STRUCTURAL || types[i] == RecordType::STRUCTURAL_BREAKEND) {
if (info.find(SVLEN) == info.end()) {
std::stringstream message;
message << "INFO " << SVLEN << " must be present for structural variants";
throw new InfoBodyError{line, message.str()};
}
if (types[i] == RecordType::STRUCTURAL && boost::regex_match(alternate, deldup_regex)) {
//del/dup tags must have svclaim
if (info.find(SVCLAIM) == info.end()) {
std::stringstream message;
message << "INFO " << SVCLAIM << " must be present for DEL/DUP";
throw new InfoBodyError{line, message.str()};
}
}
}
}
}
}

void Record::check_format() const
{
if (format.size() == 0) {
Expand Down Expand Up @@ -408,6 +437,7 @@ namespace ebi
}
}
} else if (field_key == SVLEN && values.size() == alternate_alleles.size()) {
//are spec page 21 samples correct? svlen not found for BND TODO
for (size_t i = 0; i < alternate_alleles.size(); i++) {
if (check_alt_not_symbolic(i)) {
std::string expected = std::to_string(static_cast<long>(alternate_alleles[i].size()) - static_cast<long>(reference_allele.size()));
Expand All @@ -416,12 +446,12 @@ namespace ebi
"non-symbolic alternate alleles", "SVLEN=" + field_value + ", expected value=" + expected,
ErrorFix::RECOVERABLE_VALUE, field_key, expected};
}
} else {
} else if (source->version < Version::v44) {
std::string first_field = alternate_alleles[i].substr(0, 4);
if (first_field == "<" + INS || first_field == "<" + DUP) {
size_t scanned_value_length;
int value = std::stoi(values[i], &scanned_value_length);
if (value < 0 || scanned_value_length != values[i].size()) {
size_t scanned_value_length;
int value = std::stoi(values[i], &scanned_value_length);
if (value < 0 || scanned_value_length != values[i].size()) {
throw new InfoBodyError{line, "INFO SVLEN must be a positive integer for longer ALT alleles", "SVLEN="
+ field_value + ", ALT allele=" + first_field.substr(1, 3),
ErrorFix::IRRECOVERABLE_VALUE, field_key};
Expand All @@ -430,12 +460,21 @@ namespace ebi
size_t scanned_value_length;
int value = std::stoi(values[i], &scanned_value_length);
if (value > 0 || scanned_value_length != values[i].size()) {
throw new InfoBodyError{line, "INFO SVLEN must be a negative integer for shorter ALT alleles"
throw new InfoBodyError{line, "INFO SVLEN must be a negative integer for shorter ALT alleles "
+ first_field.substr(1,3), "SVLEN=" + field_value + ", ALT allele=" + first_field.substr(1, 3),
ErrorFix::IRRECOVERABLE_VALUE, field_key};
}
}
}
else {
//v44 symbolic alleles
//TODO anything to do with CNV:TR novel TR 1 case? pg34

//for alleles other than SV, value should be '.'
if (types[i] != RecordType::STRUCTURAL && values[i] != MISSING_VALUE) {
throw new InfoBodyError{line, "INFO SVLEN should be " + MISSING_VALUE + " for SV other than INS/INV/DUP/DEL/CNV"};
}
}
}
} else if (field_key == SVTYPE) {
if (!ebi::util::contains(PREDEFINED_INFO_SVTYPES, field_value)) {
Expand All @@ -444,6 +483,39 @@ namespace ebi
ebi::util::print_container(message, PREDEFINED_INFO_SVTYPES, "", ", ", "");
throw new InfoBodyError{line, message.str(), "Found " + SVTYPE + " was '" + field_value + "'"};
}
} else if (source->version == Version::v44 && field_key == SVCLAIM && values.size() == alternate_alleles.size()) {
//not applicable for anything < v4.4
static boost::regex allele_regex("<(DUP|DEL|INS|INV|CNV)(:[^>]+)*>");
boost::cmatch pieces_match;

for (size_t i = 0; i < alternate_alleles.size(); ++i) {
std::string key;
auto & allele = alternate_alleles[i];
if (types[i] == RecordType::STRUCTURAL) {
if (boost::regex_match(allele.c_str(), pieces_match, allele_regex)) {
key = pieces_match[1]; //one of the tokens from allle_regex
} else {
key = MISSING_VALUE; //unknown SV, accept only '.'
}
} else if (types[i] == RecordType::STRUCTURAL_BREAKEND) {
key = BND;
} else {
key = MISSING_VALUE; //anything other than SV, svclaim can be '.'
}
const auto & iter = PREDEFINED_INFO_ALLELE_SVCLAIM.find(key);
if (iter != PREDEFINED_INFO_ALLELE_SVCLAIM.end()) {
auto & validvals = iter->second;
if (!ebi::util::contains(validvals, values[i])) {
std::stringstream message;
message << "INFO " << SVCLAIM << " must be one of: ";
ebi::util::print_container(message, validvals, "", ", ", "");
throw new InfoBodyError{line, message.str(), "Found " + SVCLAIM + " was '" + values[i] + "'"};
}
}
else {
BOOST_LOG_TRIVIAL(error) << "Invalid symbolic allele" << key << std::endl;
}
}
}
}

Expand Down Expand Up @@ -704,7 +776,7 @@ namespace ebi
}
if(!values.empty()) {
if (values.front() == MISSING_VALUE) { return; } // No need to check missing data
}
} //TODO, if the 1st one is . then check stops; is that correct? svclaim=.,DJ worked!
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not correct indeed:

SVCLAIM=.

Is correct because missing value applies to all value

SVCLAIM=.,DJ

Needs to be checked for cardinality


bool number_matches = true;
if (expected_cardinality > 0) {
Expand Down
1 change: 1 addition & 0 deletions src/vcf/store_parse_policy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ namespace ebi
break;
case 9:
m_line_tokens[FORMAT] = m_grouped_tokens;
//TODO: is this OK? format is optional field or not?
break;
default:
// Collection of samples
Expand Down
Loading