forked from alephdata/ingest-file
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_analysis.py
95 lines (84 loc) · 3.19 KB
/
test_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from followthemoney import model
from followthemoney.types import registry
from ingestors.analysis import Analyzer
from ingestors.analysis.patterns import EMAIL_REGEX, PHONE_REGEX
from ingestors.analysis.patterns import IBAN_REGEX
from .support import TestCase
class TestAnalysis(TestCase):
    """Run PlainText entities through ``Analyzer`` and check the emitted tags."""

    def _tagged_entity(self, entity):
        """Analyze ``entity`` and return the emitted entity with the same id.

        A new ``Analyzer`` is created for the manager's dataset, fed the
        entity, and flushed; the result is looked up via
        ``get_emitted_by_id``.
        """
        analyzer = Analyzer(self.manager.dataset, entity, {})
        analyzer.feed(entity)
        analyzer.flush()
        return self.get_emitted_by_id(entity.id)

    def _analyze_text(self, entity_id, body):
        """Build a PlainText entity holding ``body`` and return it analyzed."""
        document = model.make_entity("PlainText")
        document.id = entity_id
        document.add("bodyText", body)
        return self._tagged_entity(document)

    def test_ner_extract(self):
        """A person name in repeated German text shows up among the names."""
        sentence = "Das ist der Pudel von Angela Merkel. "
        tagged = self._analyze_text("test1", sentence * 5)
        names = tagged.get_type_values(registry.name)
        assert "Angela Merkel" in names, names

    def test_language_tagging(self):
        """French text yields the person name and a ``fra`` language tag."""
        body = "C'est le caniche d'Emmanuel Macron. " * 2
        tagged = self._analyze_text("test2", body)
        names = tagged.get_type_values(registry.name)
        assert "Emmanuel Macron" in names, names
        languages = tagged.get("detectedLanguage")
        assert languages == ["fra"], languages

    def test_pattern_extract(self):
        """A ``tel:`` number is tagged as a phone along with country ``in``."""
        body = "Mr. Flubby Flubber called the number tel:+919988111222 twice"
        tagged = self._analyze_text("test3", body)
        phones = tagged.get_type_values(registry.phone)
        assert "+919988111222" in phones
        countries = tagged.get_type_values(registry.country)
        assert "in" in countries
class TestPatterns(TestCase):
    """Check that each extraction regex finds exactly one match per sample."""

    def _assert_single_match(self, regex, samples):
        """Assert that ``regex.findall`` returns exactly one match per sample.

        All samples are evaluated before asserting, so a failure reports
        every offending input together with what was actually matched,
        rather than stopping at the first one without naming it.

        :param regex: object exposing ``findall(str)``.
        :param samples: iterable of distinct input strings.
        """
        unexpected = {}
        for sample in samples:
            matches = regex.findall(sample)
            if len(matches) != 1:
                unexpected[sample] = matches
        assert not unexpected, unexpected

    def test_phonenumbers(self):
        """``PHONE_REGEX`` matches once in each phone number sample."""
        phone_numbers = [
            "754-3010",
            "(541) 754-3010",
            "+1-541-754-3010",
            "1-541-754-3010",
            "001-541-754-3010",
            "191 541 754 3010",
            "(089) / 636-48018",
            "+49-89-636-48018",
            "19-49-89-636-48018",
            # Numbers embedded in surrounding text.
            "phone: +49-89-636-48018",
            "tel +49-89-636-48018 or so",
        ]
        self._assert_single_match(PHONE_REGEX, phone_numbers)

    def test_iban(self):
        """``IBAN_REGEX`` matches once in each IBAN sample."""
        ibans = [
            "SC52BAHL01031234567890123456USD",
            "SK8975000000000012345671",
            "SI56192001234567892",
            "ES7921000813610123456789",
            "SE1412345678901234567890",
            "CH5604835012345678009",
            "TL380080012345678910157",
            "TN4401000067123456789123",
            "TR320010009999901234567890",
            "UA903052992990004149123456789",
            "AE460090000000123456789",
            "GB98MIDL07009312345678",
            "VG21PACG0000000123456789",
        ]
        self._assert_single_match(IBAN_REGEX, ibans)

    def test_email(self):
        """``EMAIL_REGEX`` matches once in each e-mail address sample."""
        emails = ["abc@sunu.in", "abc+netflix@sunu.in", "_@sunu.in"]
        self._assert_single_match(EMAIL_REGEX, emails)