Skip to content

Commit e15ab1b

Browse files
authored
Pass homoglyph variants of names to trademark checks and NARC (#23842)
* Pass homoglyph variants of names to trademark checks and NARC
1 parent 9abf8a2 commit e15ab1b

File tree

7 files changed

+210
-34
lines changed

7 files changed

+210
-34
lines changed

requirements/prod.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -799,6 +799,9 @@ cachetools==5.5.2 \
799799
sqlparse==0.5.3 \
800800
--hash=sha256:09f67787f56a0b16ecdbde1bfc7f5d9c3371ca683cfeaa8e6ff60b4807ec9272 \
801801
--hash=sha256:cf2196ed3418f3ba5de6af7e82c694a9fbdbfecccdfc72e281548517081f16ca
802+
homoglyphs-fork==2.1.1 \
803+
--hash=sha256:42b8e77e82fff9eaa63422e3ff3149daa0c8ecc03fb2381e6aae5a961c3104c2 \
804+
--hash=sha256:d0994a1f5fa74b6bd0a25e714eb527a43335c80d0e86abf6f099d0e108dc94bd
802805
yara-python==4.5.4 \
803806
--hash=sha256:0b9de86fbe8a646c0644df9e1396d6941dc6ed0f89be2807e6c52ab39161fd9f \
804807
--hash=sha256:0e762e6c5b47ddf30b0128ba723da46fcc2aa7959a252748497492cb452d1c84 \

src/olympia/addons/tests/test_utils_.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@
3737
('Firefox add-on for Firefox', True, True),
3838
('Foobarfor Firefox', False, False),
3939
('F i r e f o x', False, False),
40+
('F î r \u0435fox', False, False),
41+
('FiRѐF0 x', False, False),
42+
('F i r \u0435 f o x', False, False),
43+
('F\u2800i r e f o x', False, False),
44+
('F\u2800 i r \u0435 f o x', False, False),
4045
('Fïrefox is great', False, False),
4146
('Foobarfor Firefox!', False, False),
4247
('Better Privacy for Firefox!', True, False),

src/olympia/addons/utils.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from olympia import amo, core
1010
from olympia.access.acl import action_allowed_for
1111
from olympia.amo.utils import (
12+
generate_lowercase_homoglyphs_variants_for_string,
1213
normalize_string_for_name_checks,
1314
verify_condition_with_locales,
1415
)
@@ -30,24 +31,27 @@ def verify_mozilla_trademark(name, user, *, form=None):
3031
)
3132

3233
def _check(name):
33-
fully_normalized_name = normalize_string_for_name_checks(name).lower()
3434
name_without_punctuation = normalize_string_for_name_checks(
3535
name, categories_to_strip=('P')
3636
).lower()
3737

38-
for symbol in amo.MOZILLA_TRADEMARK_SYMBOLS:
39-
symbol_count = fully_normalized_name.count(symbol)
40-
violates_trademark = symbol_count > 1 or (
41-
symbol_count >= 1
42-
and not name_without_punctuation.endswith(f' for {symbol}')
43-
)
38+
variants = generate_lowercase_homoglyphs_variants_for_string(
39+
normalize_string_for_name_checks(name)
40+
)
41+
42+
for variant in variants:
43+
for symbol in amo.MOZILLA_TRADEMARK_SYMBOLS:
44+
symbol_count = variant.count(symbol)
45+
violates_trademark = symbol_count > 1 or (
46+
symbol_count >= 1
47+
and not name_without_punctuation.endswith(f' for {symbol}')
48+
)
4449

45-
if violates_trademark:
46-
raise forms.ValidationError(
47-
gettext(
50+
if violates_trademark:
51+
msg = gettext(
4852
'Add-on names cannot contain the Mozilla or Firefox trademarks.'
4953
)
50-
)
54+
raise forms.ValidationError(msg)
5155

5256
if not skip_trademark_check:
5357
verify_condition_with_locales(

src/olympia/amo/tests/test_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
create_signed_url_for_file_backup,
3030
download_file_contents_from_backup_storage,
3131
extract_colors_from_image,
32+
generate_lowercase_homoglyphs_variants_for_string,
3233
get_locale_from_lang,
3334
id_to_path,
3435
is_safe_url,
@@ -453,6 +454,24 @@ def test_normalize_string_for_name_checks_with_specific_category(value, expected
453454
)
454455

455456

457+
@pytest.mark.parametrize(
458+
'value, expected',
459+
[
460+
('aBc', {'abc'}),
461+
('\u0430bc', {'abc'}),
462+
('l\u04300', {'iao', 'lao'}),
463+
('𝐪1lt', {'qiit', 'qilt', 'qlit', 'qllt'}),
464+
('bеta', {'beta'}),
465+
('қѺʍѕ', {'koms'}),
466+
('Zoom', {'zoom'}),
467+
('ТЕСТ0n1𝓀', {'tectonik', 'tectonlk'}),
468+
('ТЄСТ0n1к', {'tectonik', 'tectonlk'}),
469+
],
470+
)
471+
def test_generate_lowercase_homoglyphs_variants_for_string(value, expected):
472+
assert generate_lowercase_homoglyphs_variants_for_string(value) == expected
473+
474+
456475
@pytest.mark.parametrize(
457476
'value, expected',
458477
[

src/olympia/amo/utils.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646

4747
import basket
4848
import colorgram
49+
import homoglyphs_fork
4950
import html5lib
5051
import markupsafe
5152
import pytz
@@ -494,6 +495,37 @@ def normalize_string_for_name_checks(
494495
return value
495496

496497

498+
def generate_lowercase_homoglyphs_variants_for_string(value):
499+
"""Generates a set of lowercase homoglyph variants for a given string.
500+
501+
Value passed as argument is a string that is expected to have gone through
502+
normalization to remove characters we don't want first, see
503+
normalize_string_for_name_checks().
504+
"""
505+
# These are not normally considered confusables, but we think they should.
506+
additional_replacements = {
507+
'e': ('Э', '℈', 'Є', '€', 'Ꞓ'),
508+
'k': ('ĸ', 'κ', 'к', 'қ', 'ҝ', 'ҟ', 'ҡ', 'ᴋ'),
509+
'm': ('ʍ', 'м', 'ᴍ'),
510+
'o': ('Ѻ', 'ѻ'),
511+
}
512+
additional_replacement_table = dict(
513+
itertools.chain(
514+
*(
515+
list(zip(map(ord, letters), itertools.repeat(ord(replacement))))
516+
for replacement, letters in additional_replacements.items()
517+
)
518+
)
519+
)
520+
value = value.translate(additional_replacement_table)
521+
homoglyphs = homoglyphs_fork.Homoglyphs(
522+
languages={'en'},
523+
strategy=homoglyphs_fork.STRATEGY_LOAD,
524+
ascii_range=range(ord('A'), ord('z') + 1),
525+
)
526+
return {variant.lower() for variant in homoglyphs.to_ascii(value)}
527+
528+
497529
def slug_validator(slug, message=validate_slug.message):
498530
"""
499531
Raise an error if the string has any punctuation characters.

src/olympia/scanners/tasks.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
from olympia.addons.models import Addon
1919
from olympia.amo.celery import create_chunked_tasks_signatures, task
2020
from olympia.amo.decorators import use_primary_db
21-
from olympia.amo.utils import attach_trans_dict, normalize_string_for_name_checks
21+
from olympia.amo.utils import (
22+
attach_trans_dict,
23+
generate_lowercase_homoglyphs_variants_for_string,
24+
normalize_string_for_name_checks,
25+
)
2226
from olympia.constants.scanners import (
2327
ABORTED,
2428
ABORTING,
@@ -228,10 +232,21 @@ def _run_narc(*, version, run_action_on_match):
228232
),
229233
):
230234
value = str(value)
231-
variants = [(value, False)]
235+
variants = [(value, None)]
232236
if (normalized_value := normalize_string_for_name_checks(value)) != value:
233-
variants.append((normalized_value, True))
234-
for variant, is_normalized in variants:
237+
variants.append((normalized_value, 'normalized'))
238+
homoglyph_variants = generate_lowercase_homoglyphs_variants_for_string(
239+
normalized_value
240+
)
241+
if homoglyph_variants:
242+
variants.extend(
243+
(homoglyph_variant, 'homoglyph')
244+
for homoglyph_variant in homoglyph_variants
245+
if homoglyph_variant != value.lower()
246+
and homoglyph_variant != normalized_value.lower()
247+
)
248+
249+
for variant, variant_type in variants:
235250
if match := definition.search(variant):
236251
result = {
237252
'rule': rule.name,
@@ -243,8 +258,8 @@ def _run_narc(*, version, run_action_on_match):
243258
'span': match.span(),
244259
},
245260
}
246-
if is_normalized:
247-
result['meta']['is_normalized'] = True
261+
if variant_type is not None:
262+
result['meta']['variant'] = variant_type
248263
result['meta']['original_string'] = value
249264
results.add(json.dumps(result, sort_keys=True))
250265

src/olympia/scanners/tests/test_tasks.py

Lines changed: 115 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ def test_run(self, incr_mock):
292292
assert narc_result.results == [
293293
{
294294
'meta': {
295-
'is_normalized': True,
295+
'variant': 'normalized',
296296
'locale': 'en-us',
297297
'pattern': '.*',
298298
'source': 'db_addon',
@@ -307,44 +307,44 @@ def test_run(self, incr_mock):
307307
},
308308
{
309309
'meta': {
310-
'is_normalized': True,
311-
'locale': None,
310+
'locale': 'en-us',
312311
'pattern': '.*',
313-
'source': 'author',
312+
'source': 'db_addon',
314313
'span': [
315314
0,
316-
3,
315+
27,
317316
],
318-
'string': 'Foo',
319-
'original_string': 'Fôo',
317+
'string': 'My Fancy WebExtension Addon',
320318
},
321319
'rule': 'always_match_rule',
322320
},
323321
{
324322
'meta': {
325-
'is_normalized': True,
323+
'variant': 'normalized',
326324
'locale': None,
327325
'pattern': '.*',
328-
'source': 'xpi',
326+
'source': 'author',
329327
'span': [
330328
0,
331-
19,
329+
3,
332330
],
333-
'string': 'MyWebExtensionAddon',
334-
'original_string': 'My WebExtension Addon',
331+
'string': 'Foo',
332+
'original_string': 'Fôo',
335333
},
336334
'rule': 'always_match_rule',
337335
},
338336
{
339337
'meta': {
340-
'locale': 'en-us',
338+
'variant': 'normalized',
339+
'locale': None,
341340
'pattern': '.*',
342-
'source': 'db_addon',
341+
'source': 'xpi',
343342
'span': [
344343
0,
345-
27,
344+
19,
346345
],
347-
'string': 'My Fancy WebExtension Addon',
346+
'string': 'MyWebExtensionAddon',
347+
'original_string': 'My WebExtension Addon',
348348
},
349349
'rule': 'always_match_rule',
350350
},
@@ -379,6 +379,104 @@ def test_run(self, incr_mock):
379379
]
380380
)
381381

382+
@mock.patch('olympia.scanners.tasks.statsd.incr')
383+
def test_run_normalized_match(self, incr_mock):
384+
self.addon.name = 'My\u2800 Fäncy WebExtension 𝕒ddon'
385+
self.addon.save()
386+
387+
rule = ScannerRule.objects.create(
388+
name='always_match_rule',
389+
scanner=NARC,
390+
definition='MyFancyWebExtensionAddon',
391+
)
392+
393+
run_narc_on_version(self.version.pk)
394+
395+
scanner_results = ScannerResult.objects.all()
396+
assert len(scanner_results) == 1
397+
narc_result = scanner_results[0]
398+
assert narc_result.scanner == NARC
399+
assert narc_result.upload is None
400+
assert narc_result.version == self.version
401+
assert narc_result.has_matches
402+
assert list(narc_result.matched_rules.all()) == [rule]
403+
assert len(narc_result.results) == 1
404+
assert narc_result.results == [
405+
{
406+
'meta': {
407+
'locale': 'en-us',
408+
'original_string': 'My⠀ Fäncy WebExtension 𝕒ddon',
409+
'pattern': 'MyFancyWebExtensionAddon',
410+
'source': 'db_addon',
411+
'span': [
412+
0,
413+
24,
414+
],
415+
'string': 'MyFancyWebExtensionaddon',
416+
'variant': 'normalized',
417+
},
418+
'rule': 'always_match_rule',
419+
},
420+
]
421+
assert incr_mock.called
422+
assert incr_mock.call_count == 3
423+
incr_mock.assert_has_calls(
424+
[
425+
mock.call('devhub.narc.has_matches'),
426+
mock.call(f'devhub.narc.rule.{rule.id}.match'),
427+
mock.call('devhub.narc.success'),
428+
]
429+
)
430+
431+
@mock.patch('olympia.scanners.tasks.statsd.incr')
432+
def test_run_homoglyph_match(self, incr_mock):
433+
self.addon.name = 'My\u2800 Fäncy W\u0435bExtѐnsion addon'
434+
self.addon.save()
435+
436+
rule = ScannerRule.objects.create(
437+
name='always_match_rule',
438+
scanner=NARC,
439+
definition='MyFancyWebExtensionAddon',
440+
)
441+
442+
run_narc_on_version(self.version.pk)
443+
444+
scanner_results = ScannerResult.objects.all()
445+
assert len(scanner_results) == 1
446+
narc_result = scanner_results[0]
447+
assert narc_result.scanner == NARC
448+
assert narc_result.upload is None
449+
assert narc_result.version == self.version
450+
assert narc_result.has_matches
451+
assert list(narc_result.matched_rules.all()) == [rule]
452+
assert len(narc_result.results) == 1
453+
assert narc_result.results == [
454+
{
455+
'meta': {
456+
'locale': 'en-us',
457+
'original_string': 'My⠀ Fäncy WеbExtѐnsion addon',
458+
'pattern': 'MyFancyWebExtensionAddon',
459+
'source': 'db_addon',
460+
'span': [
461+
0,
462+
24,
463+
],
464+
'string': 'myfancywebextensionaddon',
465+
'variant': 'homoglyph',
466+
},
467+
'rule': 'always_match_rule',
468+
},
469+
]
470+
assert incr_mock.called
471+
assert incr_mock.call_count == 3
472+
incr_mock.assert_has_calls(
473+
[
474+
mock.call('devhub.narc.has_matches'),
475+
mock.call(f'devhub.narc.rule.{rule.id}.match'),
476+
mock.call('devhub.narc.success'),
477+
]
478+
)
479+
382480
@mock.patch('olympia.scanners.tasks.statsd.timer')
383481
def test_calls_statsd_timer(self, timer_mock):
384482
run_narc_on_version(self.version.pk)
@@ -515,7 +613,7 @@ def test_run_multiple_authors_match(self, incr_mock):
515613
assert narc_result.results == [
516614
{
517615
'meta': {
518-
'is_normalized': True,
616+
'variant': 'normalized',
519617
'span': [0, 3],
520618
'locale': None,
521619
'source': 'author',

0 commit comments

Comments
 (0)