Skip to content

Commit

Permalink
fix(backend, prepro): Remove '-' from list of allowed symbols in unal…
Browse files Browse the repository at this point in the history
…igned sequences. (#2728)

* Remove '-' from list of allowed symbols in unaligned sequences.

* Rename NUCLEOTIDE_SYMBOLS to UNALIGNED_NUCLEOTIDE_SYMBOLS to make check clearer.

* Distinguish between unaligned and aligned nucleotide symbols in backend

* Update backend tests to include '-'

---------

Co-authored-by: Fabian Engelniederhammer <fabian.engelniederhammer@tngtech.com>
  • Loading branch information
anna-parker and fengelniederhammer authored Sep 9, 2024
1 parent 4df831f commit 5c23096
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ enum class AminoAcidSymbols(override val symbol: Char) : Symbol {
STOP('*'),
}

enum class NucleotideSymbols(override val symbol: Char) : Symbol {
enum class AlignedNucleotideSymbols(override val symbol: Char) : Symbol {
A('A'),
C('C'),
G('G'),
Expand All @@ -74,6 +74,24 @@ enum class NucleotideSymbols(override val symbol: Char) : Symbol {
GAP('-'),
}

/**
 * Symbols accepted in unaligned nucleotide sequences.
 *
 * Used as the valid-symbol set when validating `unalignedNucleotideSequences`.
 * Unlike [AlignedNucleotideSymbols], this enum has no gap (`'-'`) entry, so a
 * `'-'` in an unaligned sequence is reported as an invalid symbol.
 *
 * NOTE(review): the preprocessing pipeline keeps its own set of allowed
 * unaligned symbols, which is documented as needing to correspond at minimum
 * to this backend check - keep both in sync when changing entries here.
 */
enum class NucleotideSymbols(override val symbol: Char) : Symbol {
A('A'),
C('C'),
G('G'),
T('T'),
// Entries below are presumably the IUPAC ambiguity codes - TODO confirm.
M('M'),
R('R'),
W('W'),
S('S'),
Y('Y'),
K('K'),
V('V'),
H('H'),
D('D'),
B('B'),
N('N'),
}

private fun <T> validateNoUnknownInMetaData(data: Map<String, T>, known: List<String>) {
val unknownMetadataKeys = data.keys.subtract(known.toSet())
if (unknownMetadataKeys.isNotEmpty()) {
Expand Down Expand Up @@ -248,12 +266,12 @@ class ProcessedSequenceEntryValidator(private val schema: Schema, private val re
"nucleotideInsertions",
)

validateNoUnknownNucleotideSymbol(
validateNoUnknownNucleotideSymbol<AlignedNucleotideSymbols>(
processedData.alignedNucleotideSequences,
"alignedNucleotideSequences",
)

validateNoUnknownNucleotideSymbol(
validateNoUnknownNucleotideSymbol<NucleotideSymbols>(
processedData.unalignedNucleotideSequences,
"unalignedNucleotideSequences",
)
Expand Down Expand Up @@ -293,15 +311,15 @@ class ProcessedSequenceEntryValidator(private val schema: Schema, private val re
}
}

private fun validateNoUnknownNucleotideSymbol(
private inline fun <reified ValidSymbols> validateNoUnknownNucleotideSymbol(
dataToValidate: Map<String, GeneticSequence?>,
sequenceGrouping: String,
) {
) where ValidSymbols : Enum<ValidSymbols>, ValidSymbols : Symbol {
for ((segmentName, sequence) in dataToValidate) {
if (sequence == null) {
continue
}
val invalidSymbols = sequence.getInvalidSymbols<NucleotideSymbols>()
val invalidSymbols = sequence.getInvalidSymbols<ValidSymbols>()
if (invalidSymbols.isNotEmpty()) {
throw ProcessingValidationException(
"The sequence of segment '$segmentName' in '$sequenceGrouping' " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ object PreparedProcessedData {
segment: SegmentName,
): SubmittedProcessedData {
val unalignedNucleotideSequences = defaultProcessedData.unalignedNucleotideSequences.toMutableMap()
unalignedNucleotideSequences[segment] = "ÄÖ" + unalignedNucleotideSequences[segment]!!.substring(2)
unalignedNucleotideSequences[segment] = "ÄÖ-" + unalignedNucleotideSequences[segment]!!.substring(2)

return defaultSuccessfulSubmittedData.copy(
accession = accession,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -586,8 +586,8 @@ class SubmitProcessedDataEndpointTest(
accession = "DoesNotMatter",
segment = "main",
),
expectedErrorMessage = "The sequence of segment 'main' in 'alignedNucleotideSequences' contains " +
"invalid symbols: [Ä, Ö].",
expectedErrorMessage = "The sequence of segment 'main' in 'alignedNucleotideSequences' " +
"contains invalid symbols: [Ä, Ö].",
),
InvalidDataScenario(
name = "data with segment in unaligned nucleotide sequences with wrong symbols",
Expand All @@ -597,7 +597,7 @@ class SubmitProcessedDataEndpointTest(
segment = "main",
),
expectedErrorMessage = "The sequence of segment 'main' in 'unalignedNucleotideSequences' contains " +
"invalid symbols: [Ä, Ö].",
"invalid symbols: [Ä, Ö, -].",
),
InvalidDataScenario(
name = "data with segment in nucleotide insertions with wrong symbols",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
SegmentName,
)

NUCLEOTIDE_SYMBOLS = {
UNALIGNED_NUCLEOTIDE_SYMBOLS = {
"A",
"C",
"G",
Expand All @@ -22,7 +22,6 @@
"D",
"B",
"N",
"-",
} # This list should always correspond at minimum to the check defined in the backend


Expand All @@ -32,7 +31,7 @@ def errors_if_non_iupac(
errors: list[ProcessingAnnotation] = []
for segment, sequence in unaligned_nucleotide_sequences.items():
if sequence:
non_iupac_symbols = set(sequence.upper()) - NUCLEOTIDE_SYMBOLS
non_iupac_symbols = set(sequence.upper()) - UNALIGNED_NUCLEOTIDE_SYMBOLS
if non_iupac_symbols:
errors.append(
ProcessingAnnotation(
Expand Down

0 comments on commit 5c23096

Please sign in to comment.