-
Notifications
You must be signed in to change notification settings - Fork 1
/
match_ror.py
79 lines (74 loc) · 3.07 KB
/
match_ror.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
from project.server.main.matcher import Matcher
from project.server.main.utils import ENGLISH_STOP, FRENCH_STOP, remove_ref_index
# Strategies used by match_ror when the caller's conditions carry no
# 'strategies' entry. Each top-level item is one strategy: a list holding one
# or more alternative combinations of 'ror_*' keys.
# NOTE(review): the order looks deliberate (identifiers first, looser
# combinations later) - confirm how Matcher.match consumes it before reordering.
DEFAULT_STRATEGIES = [
    [
        ['ror_id'],
    ],
    [
        ['ror_grid_id'],
    ],
    [
        ['ror_name', 'ror_supervisor_name', 'ror_acronym', 'ror_city_nuts_level2', 'ror_country'],
        ['ror_name', 'ror_supervisor_name', 'ror_acronym', 'ror_city_zone_emploi', 'ror_country'],
    ],
    [
        ['ror_name', 'ror_acronym', 'ror_city', 'ror_country'],
        ['ror_name', 'ror_acronym', 'ror_city_zone_emploi', 'ror_country'],
    ],
    [
        ['ror_name', 'ror_country', 'ror_web_url'],
    ],
    [
        ['ror_name', 'ror_supervisor_name', 'ror_city', 'ror_country'],
        ['ror_name', 'ror_supervisor_name', 'ror_city_zone_emploi', 'ror_country_code'],
    ],
    [
        ['ror_name', 'ror_supervisor_name', 'ror_city_nuts_level2', 'ror_country'],
        ['ror_name', 'ror_supervisor_name', 'ror_city_zone_emploi', 'ror_country'],
    ],
    [
        ['ror_name', 'ror_city', 'ror_country'],
        ['ror_name', 'ror_city_zone_emploi', 'ror_country'],
    ],
    # Disabled:
    # [['ror_acronym', 'ror_city', 'ror_country'], ['ror_acronym', 'ror_city', 'ror_country_code']],
    [
        ['ror_name', 'ror_acronym', 'ror_city'],
        ['ror_name', 'ror_acronym', 'ror_country'],
    ],
    [
        ['ror_web_url', 'ror_country'],
    ],
    # Disabled:
    # ['ror_name', 'ror_acronym', 'ror_country_code']
    # [['ror_name', 'ror_country'],
    #  ['ror_name', 'ror_country_code']
    # ],
    [
        ['ror_name', 'ror_city'],
        ['ror_name', 'ror_city_nuts_level2'],
        ['ror_name', 'ror_city_zone_emploi'],
    ],
    [
        ['ror_web_domain', 'ror_country'],
    ],
    # Disabled:
    # [['ror_name_unique', 'ror_city_nuts_level2']],
    # [['ror_acronym_unique', 'ror_city_nuts_level2']],
    # [['ror_name_unique', 'ror_country']],
    # [['ror_acronym_unique', 'ror_country']],
]
# Stop words per criterion, passed to Matcher.match as `stopwords_strategies`.
# Both name-like criteria get the English and French stop words combined.
# The concatenation is evaluated once per key so each key owns its own object.
STOPWORDS_STRATEGIES = {
    criterion: ENGLISH_STOP + FRENCH_STOP
    for criterion in ('ror_name', 'ror_supervisor_name')
}
def replace_synonym(query: str, source: str, target: str) -> str:
    """Replace the abbreviation *source* in *query* with *target*.

    An occurrence is replaced only when it is immediately followed by a space
    or a comma. That delimiter is consumed and a single space is written after
    *target*. Matching ignores case. An occurrence at the very end of the
    string (no trailing delimiter) is left as is.

    :param query: text to rewrite.
    :param source: abbreviation to look for, taken as literal text.
    :param target: literal replacement text.
    :return: the rewritten query.
    """
    # Escape the abbreviation: its trailing '.' must match a real dot, not any
    # character. Unescaped, 'eng.' matched the 'enge' in "challenge " and
    # produced "challengineering ".
    pattern = re.compile("(" + re.escape(source) + ")( |,)", re.IGNORECASE)
    # A function replacement keeps `target` literal (no backslash templates).
    return pattern.sub(lambda _match: target + " ", query)
# Done here rather than in synonym settings in ES as they seem to cause highlight bugs
def pre_treatment_ror(query: str = '') -> str:
    """Clean up a raw query string before it is matched against ROR.

    The query goes through ``remove_ref_index``, then a fixed list of
    abbreviations is expanded one after another with ``replace_synonym``,
    and the result is lower-cased.

    :param query: raw query text; defaults to the empty string.
    :return: the cleaned, lower-cased query.
    """
    # A query starting with a digit may carry a reference index;
    # remove_ref_index (project helper) deals with that case.
    cleaned = remove_ref_index(query)
    # (abbreviation, expansion) pairs, applied in this order.
    abbreviations = (
        ('comput.', 'computer'),
        ('dpt.', 'department'),
        ('eng.', 'engineering'),
        ('inst.', 'institute'),
        ('institut', 'institute'),
        ('mech.', 'mechanics'),
        ('sci.', 'sciences'),
        ('technol.', 'technology'),
        ('univ.', 'university'),
    )
    for short_form, expansion in abbreviations:
        cleaned = replace_synonym(cleaned, short_form, expansion)
    return cleaned.lower()
def match_ror(conditions: dict) -> dict:
    """Run the ROR matcher for the given conditions.

    :param conditions: matching request. Its optional ``'strategies'`` entry
        overrides ``DEFAULT_STRATEGIES``; a missing or ``None`` value falls
        back to the defaults. The whole dict is forwarded to the matcher.
    :return: whatever ``Matcher.match`` returns for the ``'rors'`` field.
    """
    requested = conditions.get('strategies')
    strategies = DEFAULT_STRATEGIES if requested is None else requested
    return Matcher().match(
        field='rors',
        conditions=conditions,
        strategies=strategies,
        pre_treatment_query=pre_treatment_ror,
        stopwords_strategies=STOPWORDS_STRATEGIES,
    )