Skip to content

Commit ba6e893

Browse files
committed
Add additional email normalization
1 parent 59d33c6 commit ba6e893

File tree

3 files changed

+296
-17
lines changed

3 files changed

+296
-17
lines changed

HISTORY.rst

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,31 @@ History
99
* Added the following new values to the ``/payment/processor`` validation:
1010
* ``pxp_financial``
1111
* ``trustpay``
12+
* Equivalent domain names are now normalized when ``hash_address`` is used.
13+
For example, ``googlemail.com`` will become ``gmail.com``.
14+
* Periods are now removed from ``gmail.com`` email address local parts when
15+
``hash_address`` is used. For example, ``f.o.o@gmail.com`` will become
16+
``foo@gmail.com``.
17+
* Fastmail alias subdomain email addresses are now normalized when
18+
``hash_address`` is used. For example, ``alias@user.fastmail.com`` will
19+
become ``user@fastmail.com``.
20+
* Email addresses at additional Yahoo domains, such as ``ymail.com``, now have aliases removed from
21+
their local part when ``hash_address`` is used. For example,
22+
``foo-bar@yahoo.com`` will become ``foo@yahoo.com`` for additional
23+
``yahoo.com`` domains.
24+
* Duplicate ``.com`` suffixes are now removed from email domain names when
25+
``hash_address`` is used. For example, ``example.com.com`` will become
26+
``example.com``.
27+
* Extraneous characters after ``.com`` are now removed from email domain
28+
names when ``hash_address`` is used. For example, ``example.comfoo`` will
29+
become ``example.com``.
30+
* Certain ``.com`` typos are now normalized to ``.com`` when ``hash_address`` is
31+
used. For example, ``example.cam`` will become ``example.com``.
32+
* Additional ``gmail.com`` domain names with leading digits are now
33+
normalized when ``hash_address`` is used. For example, ``100gmail.com`` will
34+
become ``gmail.com``.
35+
* Additional ``gmail.com`` typos are now normalized when ``hash_address`` is
36+
used. For example, ``gmali.com`` will become ``gmail.com``.
1237

1338
2.9.0 (2023-12-05)
1439
++++++++++++++++++

minfraud/request.py

Lines changed: 231 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
66
"""
77

8+
import re
89
import warnings
910
import hashlib
1011
from typing import Any, Dict
@@ -15,17 +16,207 @@
1516

1617
_TYPO_DOMAINS = {
1718
# gmail.com
18-
"35gmai.com": "gmail.com",
19-
"636gmail.com": "gmail.com",
19+
"gmai.com": "gmail.com",
2020
"gamil.com": "gmail.com",
21-
"gmail.comu": "gmail.com",
21+
"gmali.com": "gmail.com",
2222
"gmial.com": "gmail.com",
2323
"gmil.com": "gmail.com",
24+
"gmaill.com": "gmail.com",
25+
"gmailm.com": "gmail.com",
26+
"gmailo.com": "gmail.com",
27+
"gmailyhoo.com": "gmail.com",
2428
"yahoogmail.com": "gmail.com",
2529
# outlook.com
2630
"putlook.com": "outlook.com",
2731
}
2832

33+
_EQUIVALENT_DOMAINS = {
34+
"googlemail.com": "gmail.com",
35+
"pm.me": "protonmail.com",
36+
"proton.me": "protonmail.com",
37+
"yandex.by": "yandex.ru",
38+
"yandex.com": "yandex.ru",
39+
"yandex.kz": "yandex.ru",
40+
"yandex.ua": "yandex.ru",
41+
"ya.ru": "yandex.ru",
42+
}
43+
44+
_FASTMAIL_DOMAINS = {
45+
"123mail.org",
46+
"150mail.com",
47+
"150ml.com",
48+
"16mail.com",
49+
"2-mail.com",
50+
"4email.net",
51+
"50mail.com",
52+
"airpost.net",
53+
"allmail.net",
54+
"bestmail.us",
55+
"cluemail.com",
56+
"elitemail.org",
57+
"emailcorner.net",
58+
"emailengine.net",
59+
"emailengine.org",
60+
"emailgroups.net",
61+
"emailplus.org",
62+
"emailuser.net",
63+
"eml.cc",
64+
"f-m.fm",
65+
"fast-email.com",
66+
"fast-mail.org",
67+
"fastem.com",
68+
"fastemail.us",
69+
"fastemailer.com",
70+
"fastest.cc",
71+
"fastimap.com",
72+
"fastmail.cn",
73+
"fastmail.co.uk",
74+
"fastmail.com",
75+
"fastmail.com.au",
76+
"fastmail.de",
77+
"fastmail.es",
78+
"fastmail.fm",
79+
"fastmail.fr",
80+
"fastmail.im",
81+
"fastmail.in",
82+
"fastmail.jp",
83+
"fastmail.mx",
84+
"fastmail.net",
85+
"fastmail.nl",
86+
"fastmail.org",
87+
"fastmail.se",
88+
"fastmail.to",
89+
"fastmail.tw",
90+
"fastmail.uk",
91+
"fastmail.us",
92+
"fastmailbox.net",
93+
"fastmessaging.com",
94+
"fea.st",
95+
"fmail.co.uk",
96+
"fmailbox.com",
97+
"fmgirl.com",
98+
"fmguy.com",
99+
"ftml.net",
100+
"h-mail.us",
101+
"hailmail.net",
102+
"imap-mail.com",
103+
"imap.cc",
104+
"imapmail.org",
105+
"inoutbox.com",
106+
"internet-e-mail.com",
107+
"internet-mail.org",
108+
"internetemails.net",
109+
"internetmailing.net",
110+
"jetemail.net",
111+
"justemail.net",
112+
"letterboxes.org",
113+
"mail-central.com",
114+
"mail-page.com",
115+
"mailandftp.com",
116+
"mailas.com",
117+
"mailbolt.com",
118+
"mailc.net",
119+
"mailcan.com",
120+
"mailforce.net",
121+
"mailftp.com",
122+
"mailhaven.com",
123+
"mailingaddress.org",
124+
"mailite.com",
125+
"mailmight.com",
126+
"mailnew.com",
127+
"mailsent.net",
128+
"mailservice.ms",
129+
"mailup.net",
130+
"mailworks.org",
131+
"ml1.net",
132+
"mm.st",
133+
"myfastmail.com",
134+
"mymacmail.com",
135+
"nospammail.net",
136+
"ownmail.net",
137+
"petml.com",
138+
"postinbox.com",
139+
"postpro.net",
140+
"proinbox.com",
141+
"promessage.com",
142+
"realemail.net",
143+
"reallyfast.biz",
144+
"reallyfast.info",
145+
"rushpost.com",
146+
"sent.as",
147+
"sent.at",
148+
"sent.com",
149+
"speedpost.net",
150+
"speedymail.org",
151+
"ssl-mail.com",
152+
"swift-mail.com",
153+
"the-fastest.net",
154+
"the-quickest.com",
155+
"theinternetemail.com",
156+
"veryfast.biz",
157+
"veryspeedy.net",
158+
"warpmail.net",
159+
"xsmail.com",
160+
"yepmail.net",
161+
"your-mail.com",
162+
}
163+
164+
_YAHOO_DOMAINS = {
165+
"y7mail.com",
166+
"yahoo.at",
167+
"yahoo.be",
168+
"yahoo.bg",
169+
"yahoo.ca",
170+
"yahoo.cl",
171+
"yahoo.co.id",
172+
"yahoo.co.il",
173+
"yahoo.co.in",
174+
"yahoo.co.kr",
175+
"yahoo.co.nz",
176+
"yahoo.co.th",
177+
"yahoo.co.uk",
178+
"yahoo.co.za",
179+
"yahoo.com",
180+
"yahoo.com.ar",
181+
"yahoo.com.au",
182+
"yahoo.com.br",
183+
"yahoo.com.co",
184+
"yahoo.com.hk",
185+
"yahoo.com.hr",
186+
"yahoo.com.mx",
187+
"yahoo.com.my",
188+
"yahoo.com.pe",
189+
"yahoo.com.ph",
190+
"yahoo.com.sg",
191+
"yahoo.com.tr",
192+
"yahoo.com.tw",
193+
"yahoo.com.ua",
194+
"yahoo.com.ve",
195+
"yahoo.com.vn",
196+
"yahoo.cz",
197+
"yahoo.de",
198+
"yahoo.dk",
199+
"yahoo.ee",
200+
"yahoo.es",
201+
"yahoo.fi",
202+
"yahoo.fr",
203+
"yahoo.gr",
204+
"yahoo.hu",
205+
"yahoo.ie",
206+
"yahoo.in",
207+
"yahoo.it",
208+
"yahoo.lt",
209+
"yahoo.lv",
210+
"yahoo.nl",
211+
"yahoo.no",
212+
"yahoo.pl",
213+
"yahoo.pt",
214+
"yahoo.ro",
215+
"yahoo.se",
216+
"yahoo.sk",
217+
"ymail.com",
218+
}
219+
29220

30221
def prepare_report(request: Dict[str, Any], validate: bool):
31222
"""Validate and prepare minFraud report"""
@@ -91,29 +282,42 @@ def maybe_hash_email(transaction):
91282
if address is None:
92283
return
93284

94-
address = address.lower().strip()
95-
96-
at_idx = address.rfind("@")
97-
if at_idx == -1:
285+
address, domain = _clean_email(address)
286+
if not address:
98287
return
99288

100-
domain = _clean_domain(address[at_idx + 1 :]) # noqa
101-
local_part = address[:at_idx]
102-
103289
if domain != "" and "domain" not in email:
104290
email["domain"] = domain
105291

106-
email["address"] = _hash_email(local_part, domain)
292+
email["address"] = hashlib.md5(address.encode("UTF-8")).hexdigest()
107293

108294

109295
def _clean_domain(domain):
    """Normalize the domain part of an email address.

    The domain is trimmed, IDNA-encoded to ASCII, corrected for common
    ``.com`` mistakes, and finally mapped through the ``_TYPO_DOMAINS`` and
    ``_EQUIVALENT_DOMAINS`` lookup tables.

    :param domain: the text following the last ``@`` of an email address.
    :return: the normalized ASCII domain name.

    Errors raised by the ``idna`` codec for labels it cannot encode are not
    caught here and propagate to the caller.
    """
    domain = domain.strip().rstrip(".").encode("idna").decode("ASCII")
    # The IDNA codec also treats the ideographic full stops (U+3002, U+FF0E,
    # U+FF61) as label separators, so a trailing one survives the rstrip
    # above and comes back as an ASCII "."; remove it as well.
    domain = domain.rstrip(".")

    # Collapse repeated ".com" suffixes: example.com.com -> example.com
    domain = re.sub(r"(?:\.com){2,}$", ".com", domain)
    # Drop extraneous characters after ".com": example.comfoo -> example.com
    domain = re.sub(r"\.com[^.]+$", ".com", domain)
    # Rewrite common misspellings of the ".com" TLD: example.cam -> example.com.
    # There is deliberately no empty alternative here; with one, a bare
    # trailing "." would match and be turned into ".com".
    domain = re.sub(
        r"\.(?:com|c[a-z]{1,2}m|co[ln]|[dsvx]o[mn])$", ".com", domain
    )
    # Strip leading digits from gmail domains: 100gmail.com -> gmail.com
    domain = re.sub(r"^\d+(?:gmail?\.com)$", "gmail.com", domain)

    # Whole-domain typo fixes first, then provider aliases.
    domain = _TYPO_DOMAINS.get(domain, domain)
    domain = _EQUIVALENT_DOMAINS.get(domain, domain)

    return domain
307+
308+
309+
def _clean_email(address):
310+
address = address.lower().strip()
311+
312+
at_idx = address.rfind("@")
313+
if at_idx == -1:
314+
return None, None
315+
316+
domain = _clean_domain(address[at_idx + 1 :]) # noqa
317+
local_part = address[:at_idx]
318+
319+
# Strip off aliased part of email address.
320+
if domain in _YAHOO_DOMAINS:
117321
divider = "-"
118322
else:
119323
divider = "+"
@@ -122,4 +326,15 @@ def _hash_email(local_part, domain):
122326
if alias_idx > 0:
123327
local_part = local_part[:alias_idx]
124328

125-
return hashlib.md5(f"{local_part}@{domain}".encode("UTF-8")).hexdigest()
329+
if domain == "gmail.com":
330+
local_part = local_part.replace(".", "")
331+
332+
domain_parts = domain.split(".")
333+
if len(domain_parts) > 2:
334+
possible_domain = ".".join(domain_parts[1:])
335+
if possible_domain in _FASTMAIL_DOMAINS:
336+
domain = possible_domain
337+
if local_part != "":
338+
local_part = domain_parts[0]
339+
340+
return f"{local_part}@{domain}", domain

tests/test_request.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import unittest
22

3-
from minfraud.request import maybe_hash_email, clean_credit_card
3+
from minfraud.request import (
4+
maybe_hash_email,
5+
clean_credit_card,
6+
_clean_email,
7+
)
48

59

610
class TestRequest(unittest.TestCase):
@@ -191,3 +195,38 @@ def test_clean_credit_card(self):
191195
clean_credit_card(transaction)
192196

193197
self.assertEqual(test["expected"], transaction)
198+
199+
200+
def test_clean_email():
    """Check the normalized address returned by ``_clean_email``.

    Each case pairs a raw input with the expected first element of the
    tuple returned by ``_clean_email``; ``None`` is expected for inputs
    that contain no ``@``.
    """
    cases = [
        {"input": "", "output": None},
        {"input": "fasfs", "output": None},
        {"input": "test@gmail", "output": "test@gmail"},
        {"input": "e4d909c290d0fb1ca068ffaddf22cbd0", "output": None},
        {"input": "Test@maxmind", "output": "test@maxmind"},
        {"input": "Test@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test+007@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test+007+008@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test+@maxmind.com", "output": "test@maxmind.com"},
        {"input": "+@maxmind.com", "output": "+@maxmind.com"},
        {"input": " Test@maxmind.com", "output": "test@maxmind.com"},
        {"input": "Test@maxmind.com|abc124472372", "output": "test@maxmind.com"},
        {"input": "Test+foo@yahoo.com", "output": "test+foo@yahoo.com"},
        {"input": "Test-foo@yahoo.com", "output": "test@yahoo.com"},
        {"input": "Test-foo-foo2@yahoo.com", "output": "test@yahoo.com"},
        {"input": "Test-foo@gmail.com", "output": "test-foo@gmail.com"},
        {"input": "gamil.com@gamil.com", "output": "gamilcom@gmail.com"},
        {"input": "Test+alias@bücher.com", "output": "test@xn--bcher-kva.com"},
        {"input": "foo@googlemail.com", "output": "foo@gmail.com"},
        {"input": "foo.bar@gmail.com", "output": "foobar@gmail.com"},
        {"input": "alias@user.fastmail.com", "output": "user@fastmail.com"},
        {"input": "foo-bar@ymail.com", "output": "foo@ymail.com"},
        {"input": "foo@example.com.com", "output": "foo@example.com"},
        {"input": "foo@example.comfoo", "output": "foo@example.com"},
        {"input": "foo@example.cam", "output": "foo@example.com"},
        {"input": "foo@10000gmail.com", "output": "foo@gmail.com"},
    ]

    for case in cases:
        # Only the cleaned address is checked; the returned domain is ignored.
        cleaned = _clean_email(case["input"])[0]
        assert case["output"] == cleaned

0 commit comments

Comments
 (0)