From 5c2309692014a00fa8a4ec2f1830cf2f8cf00910 Mon Sep 17 00:00:00 2001 From: "Anna (Anya) Parker" <50943381+anna-parker@users.noreply.github.com> Date: Mon, 9 Sep 2024 15:32:17 +0200 Subject: [PATCH] fix(backend, prepro): Remove '-' from list of allowed symbols in unaligned sequences. (#2728) * Remove '-' from list of allowed symbols in unaligned sequences. * Rename NUCLEOTIDE_SYMBOLS to UNALIGNED_NUCLEOTIDE_SYMBOLS to make check clearer. * Distinguish between unaligned and aligned nucleotide symbols in backend * Update backend tests to include '-' --------- Co-authored-by: Fabian Engelniederhammer --- .../ProcessedSequenceEntryValidator.kt | 30 +++++++++++++++---- .../submission/PreparedProcessedData.kt | 2 +- .../SubmitProcessedDataEndpointTest.kt | 6 ++-- .../loculus_preprocessing/sequence_checks.py | 5 ++-- 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt b/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt index 2db07346e..ca8e3a18e 100644 --- a/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt +++ b/backend/src/main/kotlin/org/loculus/backend/service/submission/ProcessedSequenceEntryValidator.kt @@ -55,7 +55,7 @@ enum class AminoAcidSymbols(override val symbol: Char) : Symbol { STOP('*'), } -enum class NucleotideSymbols(override val symbol: Char) : Symbol { +enum class AlignedNucleotideSymbols(override val symbol: Char) : Symbol { A('A'), C('C'), G('G'), @@ -74,6 +74,24 @@ enum class NucleotideSymbols(override val symbol: Char) : Symbol { GAP('-'), } +enum class NucleotideSymbols(override val symbol: Char) : Symbol { + A('A'), + C('C'), + G('G'), + T('T'), + M('M'), + R('R'), + W('W'), + S('S'), + Y('Y'), + K('K'), + V('V'), + H('H'), + D('D'), + B('B'), + N('N'), +} + private fun validateNoUnknownInMetaData(data: Map, known: List) { val unknownMetadataKeys = data.keys.subtract(known.toSet()) if (unknownMetadataKeys.isNotEmpty()) { @@ -248,12 +266,12 @@ class ProcessedSequenceEntryValidator(private val schema: Schema, private val re "nucleotideInsertions", ) - validateNoUnknownNucleotideSymbol( + validateNoUnknownNucleotideSymbol( processedData.alignedNucleotideSequences, "alignedNucleotideSequences", ) - validateNoUnknownNucleotideSymbol( + validateNoUnknownNucleotideSymbol( processedData.unalignedNucleotideSequences, "unalignedNucleotideSequences", ) @@ -293,15 +311,15 @@ class ProcessedSequenceEntryValidator(private val schema: Schema, private val re } } - private fun validateNoUnknownNucleotideSymbol( + private inline fun validateNoUnknownNucleotideSymbol( dataToValidate: Map, sequenceGrouping: String, - ) { + ) where ValidSymbols : Enum, ValidSymbols : Symbol { for ((segmentName, sequence) in dataToValidate) { if (sequence == null) { continue } - val invalidSymbols = sequence.getInvalidSymbols() + val invalidSymbols = sequence.getInvalidSymbols() if (invalidSymbols.isNotEmpty()) { throw ProcessingValidationException( "The sequence of segment '$segmentName' in '$sequenceGrouping' " + diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt index 235a919bf..5b7c9a6eb 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/PreparedProcessedData.kt @@ -307,7 +307,7 @@ object PreparedProcessedData { segment: SegmentName, ): SubmittedProcessedData { val unalignedNucleotideSequences = defaultProcessedData.unalignedNucleotideSequences.toMutableMap() - unalignedNucleotideSequences[segment] = "ÄÖ" + unalignedNucleotideSequences[segment]!!.substring(2) + unalignedNucleotideSequences[segment] = "ÄÖ-" + unalignedNucleotideSequences[segment]!!.substring(2) return defaultSuccessfulSubmittedData.copy( accession = accession, diff --git a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitProcessedDataEndpointTest.kt b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitProcessedDataEndpointTest.kt index d0f10b58e..ba1100877 100644 --- a/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitProcessedDataEndpointTest.kt +++ b/backend/src/test/kotlin/org/loculus/backend/controller/submission/SubmitProcessedDataEndpointTest.kt @@ -586,8 +586,8 @@ class SubmitProcessedDataEndpointTest( accession = "DoesNotMatter", segment = "main", ), - expectedErrorMessage = "The sequence of segment 'main' in 'alignedNucleotideSequences' contains " + - "invalid symbols: [Ä, Ö].", + expectedErrorMessage = "The sequence of segment 'main' in 'alignedNucleotideSequences' " + + "contains invalid symbols: [Ä, Ö].", ), InvalidDataScenario( name = "data with segment in unaligned nucleotide sequences with wrong symbols", @@ -597,7 +597,7 @@ class SubmitProcessedDataEndpointTest( segment = "main", ), expectedErrorMessage = "The sequence of segment 'main' in 'unalignedNucleotideSequences' contains " + - "invalid symbols: [Ä, Ö].", + "invalid symbols: [Ä, Ö, -].", ), InvalidDataScenario( name = "data with segment in nucleotide insertions with wrong symbols", diff --git a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py index d8f641109..7a2553a1f 100644 --- a/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py +++ b/preprocessing/nextclade/src/loculus_preprocessing/sequence_checks.py @@ -6,7 +6,7 @@ SegmentName, ) -NUCLEOTIDE_SYMBOLS = { +UNALIGNED_NUCLEOTIDE_SYMBOLS = { "A", "C", "G", @@ -22,7 +22,6 @@ "D", "B", "N", - "-", } # This list should always correspond at minimum to the check defined in the backend @@ -32,7 +31,7 @@ def errors_if_non_iupac( errors: list[ProcessingAnnotation] = [] for segment, sequence in unaligned_nucleotide_sequences.items(): if sequence: - non_iupac_symbols = set(sequence.upper()) - NUCLEOTIDE_SYMBOLS + non_iupac_symbols = set(sequence.upper()) - UNALIGNED_NUCLEOTIDE_SYMBOLS if non_iupac_symbols: errors.append( ProcessingAnnotation(