diff --git a/lib/ProductOpener/Ingredients.pm b/lib/ProductOpener/Ingredients.pm
index 9e55289d150fd..e5fe473fd674a 100644
--- a/lib/ProductOpener/Ingredients.pm
+++ b/lib/ProductOpener/Ingredients.pm
@@ -4859,8 +4859,33 @@ sub preparse_ingredients_text ($product_lc, $text) {
 	# colorants alimentaires E (124,122,133,104,110)
 
 	my $roman_numerals = "i|ii|iii|iv|v|vi|vii|viii|ix|x|xi|xii|xii|xiv|xv";
-	my $additivesregexp
-		= '(\d{3}|\d{4})(( |-|\.)?([abcdefgh]))?(( |-|\.)?((' . $roman_numerals . ')|\((' . $roman_numerals . ')\)))?';
+	my $additivesregexp;
+	# special cases, when $and (" a ", " e " or " i ") conflict with variants (E470a, E472e or E451i or E451(i))
+	# in these cases, we fetch variant only if there is no space before
+	# E470a -> ok, E470 a -> not ok, E470 a, -> ok
+	# E451i -> ok, E451 i -> not ok, E451 i, -> ok
+	if ($and eq " a " || $and eq " e ") {
+		# based on $additivesregexp below in the else, with following modifications
+		# no space before abcdefgh (the roman numeral part and the capture group numbering are unchanged)
+		$additivesregexp
+			= '(\d{3}|\d{4})((-|\.)?([abcdefgh]))?(( |-|\.)?((' . $roman_numerals . ')|\((' . $roman_numerals . ')\)))?';
+	}
+	elsif ($and eq " i ") {
+		# based on $additivesregexp below in the else, with following modifications
+		# no space before i (the letter part and the capture group numbering are unchanged)
+		$additivesregexp
+			= '(\d{3}|\d{4})(( |-|\.)?([abcdefgh]))?((-|\.)?(('
+			. $roman_numerals . ')|\(('
+			. $roman_numerals
+			. ')\)))?';
+	}
+	else {
+		$additivesregexp
+			= '(\d{3}|\d{4})(( |-|\.)?([abcdefgh]))?(( |-|\.)?(('
+			. $roman_numerals . ')|\(('
+			. $roman_numerals
+			. ')\)))?';
+	}
 
 	$text
 		=~ s/\b(e|ins|sin|i-n-s|s-i-n|i\.n\.s\.?|s\.i\.n\.?)(:|\(|\[| | n| nb|#|°)+((($additivesregexp)( |\/| \/ | - |,|, |$and))+($additivesregexp))\b(\s?(\)|\]))?/normalize_additives_enumeration($product_lc,$3)/ieg;
@@ -4878,6 +4903,10 @@ sub preparse_ingredients_text ($product_lc, $text) {
 
 	# Canonicalize additives to remove the dash that can make further parsing break
 	# Match E + number + letter a to h + i to xv, followed by a space or separator
+	# $3 would be either \d{3} or \d{4} in $additivesregexp
+	# $6 would be ([abcdefgh]) in $additivesregexp
+	# $9 would be ((' . $roman_numerals . ')|\((' . $roman_numerals . ')\)) in $additivesregexp, without its leading separator
+	# $12 would be (\b|\s|,|\.|;|\/|-|\\|\)|\]|$)
 	$text =~ s/(\b)e( |-|\.)?$additivesregexp(\b|\s|,|\.|;|\/|-|\\|\)|\]|$)/replace_additive($3,$6,$9) . $12/ieg;
 
 	# E100 et E120 -> E100, E120
diff --git a/tests/unit/ingredients_parsing.t b/tests/unit/ingredients_parsing.t
index 0402b05a81f6b..499357b6d7b34 100755
--- a/tests/unit/ingredients_parsing.t
+++ b/tests/unit/ingredients_parsing.t
@@ -600,7 +600,32 @@ my @lists = (
 	["ru", "масло растительное (подсолнечное, соевое)", "масло растительное подсолнечное, масло растительное соевое"],
 
 	# grammes -> g
-	["fr", "Teneur en fruits: 50gr pour 100 grammes", "Teneur en fruits: 50g pour 100 g"]
+	["fr", "Teneur en fruits: 50gr pour 100 grammes", "Teneur en fruits: 50g pour 100 g"],
+
+	# test conflicts between the word "and" in some languages and additives variants. With letters i or e or a.
+	[
+		"hr",
+		"bojilo: E 150a, tvari za rahljenje: E 500 i E 503, sol.",
+		"bojilo: e150a, tvari za rahljenje: e500, e503, sol."
+	],
+	[
+		"hr",
+		"bojilo: E 150a, tvari za rahljenje: E 500 i, E 503, sol.",
+		"bojilo: e150a, tvari za rahljenje: e500 i, e503, sol."
+	],
+	[
+		"hr",
+		"bojilo: E 150a, tvari za rahljenje: E 500(i), E 503, sol.",
+		"bojilo: e150a, tvari za rahljenje: e500i, e503, sol."
+	],
+	[
+		"hr",
+		"bojilo: E 150a, tvari za rahljenje: E 500i, E 503, sol.",
+		"bojilo: e150a, tvari za rahljenje: e500i, e503, sol."
+	],
+	["it", "formaggio, E 472 e, E470a.", "formaggio, e472 e, e470a."],
+	["it", "formaggio, E 472 e E470a.", "formaggio, e472, e470a."],
+	["sk", "syr, E470 a E470a, mlieko.", "syr, e470, e470a, mlieko."]
 );
 
 foreach my $test_ref (@lists) {