Skip to content

Commit bc4adbc

Browse files
committed
minor changes in parser Intents() class
- parser Intents() match_order() now accepts preferences in reverse order - typo_prefix and typo_affix now accept matches where the prefixes/affixes are separated from the stem by a space - fixed a bug where typo_stem could be empty, thus matching all texts - added Mézga Géza reference to entities - increased version number to 1.1.7
1 parent 1310b70 commit bc4adbc

File tree

3 files changed

+17
-8
lines changed

3 files changed

+17
-8
lines changed

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.1.6'
6+
__version__ = '1.1.7'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/entities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def commands():
4545
"redo" : [{"stem":"mégis"},{"stem":"(meg)ism[eé]t(l[eé]s|el(je)?d?)?","wordclass":"regex"},{"stem":"el[oöő]rel[eé]p([eé]s)?","wordclass":"regex"},{"stem":"l[eé]p.+?el[oöő]re","wordclass":"regex"},{"stem":"redo"},{"stem":"m[eé]gse.+?von.+?vis+za","wordclass":"regex"}],
4646
"restart" : [{"stem":"ind[ií][ct]+sa?d?(\sel)?(\s[uú]j(ra|b[oó]l))","wordclass":"regex"},{"stem":"újraindít","wordclass":"verb"},{"stem":"(([uú]jra)?kezd\w{0,5}|kezd\w{0,5}.+?([uú]jra|el[oöő]l?r[oö]l|(leg)?elej[eé](t|r)[oöő]l))","wordclass":"regex"}],
4747
"play" : [{"stem":"(le)?j[aá](ts+z|c+)([aá]([dls]|ni))?(\sle)?(\svalamit?)?(\segy)?","wordclass":"regex"},{"stem":"play"},{"stem":"indít","wordclass":"verb","prefix":["el"],"exc":[{"stem":"újra"}]}],
48-
"stop" : [{"stem":"(meg|le)?[aá]l+(j+([aá]l)?|[ií][ct]+(s?a?d|sa|[aá](ni|s)))(\smeg|\sle)?","wordclass":"regex"},{"stem":"stop"},{"stem":"el[eé]g(\sis)?(\sle(sz|gyen))?(\sm[aá]r)?(\smost)?(\sen+yi)?","wordclass":"regex"},{"stem":"(kus+(olj([aá]l)?)?|fog(ja)?d\s?be)","wordclass":"regex"},{"stem":"(s+h{2,})|(p+s+z*t+)","wordclass":"regex"}],
48+
"stop" : [{"stem":"(meg|le)?[aá]l+(j+([aá]l)?|[ií][ct]+(s?a?d|sa|[aá](ni|s)))(\smeg|\sle)?","wordclass":"regex"},{"stem":"stop"},{"stem":"el[eé]g(\sis)?(\sle(sz|gyen))?(\sm[aá]r)?(\smost)?(\sen+yi)?","wordclass":"regex"},{"stem":"(kus+(olj([aá]l)?)?|fog(ja)?d\s?be)","wordclass":"regex"},{"stem":"(s+h{2,})|(p+s+z*t+)","wordclass":"regex"},{"stem":"a[bp]+\s?h?agy","wordclass":"regex"}],
4949
"pause" : [{"stem":"pau[sz][aáeé]([lz]+((as+a|[jz]a)?[dj]|[jz]a|ni))?(\sle)?","wordclass":"regex"},{"stem":"sz[uü]net(elt?(et)?([eé]?s+e?d?|ni)?)?","wordclass":"regex"}],
5050
"resume" : [{"stem":"folyta\w+","wordclass":"regex"},{"stem":"resume"}],
5151
"skip" : [{"stem":"(kihagy\w+|hag+yj?a?d?\ski|([aá]t|tov[aá]b+)(l[eé]p|ugr[aá])\w*|(ugr[aá]s|ugor\w+|l[eé]p(je)?[dn])\s([aá]t|tov[aá]b+))","wordclass":"regex"},{"stem":"(sz?kip+(el\w*)?|m[aá]sikat)","wordclass":"regex"}],

lara/parser.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ def _generate(self, item):
104104
item['typo_stem'] = item['stem']
105105
else:
106106
item['typo_stem'] = lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(item['stem'])))
107+
if not item['typo_stem']:
108+
item['typo_stem'] = lara.nlp.trim(lara.nlp.remove_double_letters(item['stem']))
107109

108110
if 'prefix' not in item:
109111
if item['wordclass'] == 'verb':
@@ -122,13 +124,13 @@ def _generate(self, item):
122124
if 'typo_prefix' not in item:
123125
if isinstance(item['prefix'],list):
124126
typo_prefix = ['(?:'+self._scramble(lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(elem))), (item['wordclass'] == 'adjective'))+')' for elem in item['prefix']]
125-
item['typo_prefix'] = r'(?:'+('|'.join(typo_prefix))+r')?'
127+
item['typo_prefix'] = r'(?:'+('|'.join(typo_prefix))+r')?\s?'
126128
else:
127129
item['typo_prefix'] = r''+lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(item['prefix'])))
128130
else:
129131
if isinstance(item['typo_prefix'],list):
130132
item['typo_prefix'] = [re.escape(prefix) for prefix in item['typo_prefix']]
131-
item['typo_prefix'] = r'(?:'+('|'.join(item['typo_prefix']))+')?' #prefix?
133+
item['typo_prefix'] = r'(?:'+('|'.join(item['typo_prefix']))+')?\s?'
132134
else:
133135
item['typo_prefix'] = r''+(item['typo_prefix'])
134136
if isinstance(item['prefix'],list):
@@ -142,14 +144,14 @@ def _generate(self, item):
142144
else:
143145
if 'typo_affix' not in item:
144146
if isinstance(item['affix'],list):
145-
typo_affix = ['(?:'+self._scramble(lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(elem))), (item['wordclass'] == 'adjective'))+')' for elem in item['affix']]
146-
item['typo_affix'] = r'(?:'+('|'.join(typo_affix))+r')?'
147+
typo_affix = ['(?:'+self._scramble(lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(elem))), (item['wordclass'] == 'adjective'))+')' for elem in item['affix']]
148+
item['typo_affix'] = r'\s?(?:'+('|'.join(typo_affix))+r')?'
147149
else:
148150
item['typo_affix'] = r''+lara.nlp.strip_accents(item['affix'])
149151
else:
150152
if isinstance(item['typo_affix'],list):
151153
item['typo_affix'] = [re.escape(affix) for affix in item['typo_affix']]
152-
item['typo_affix'] = r'(?:'+('|'.join(item['typo_affix']))+')?'
154+
item['typo_affix'] = r'\s?(?:'+('|'.join(item['typo_affix']))+')?'
153155
else:
154156
item['typo_affix'] = r''+(item['typo_affix'])
155157
if isinstance(item['affix'],list):
@@ -281,10 +283,17 @@ def match_best(self, text, n=1):
281283
return {}
282284

283285
# Get best match based on preference hierarchy
284-
def match_order(self,text,preference=[]):
286+
def match_order(self,text,preference=[],reverse=False):
285287
if text:
286288
score = self.match(text)
287289
if score:
290+
if reverse:
291+
if max(score, key=score.get) not in preference:
292+
return max(score, key=score.get)
293+
for item in score:
294+
if item not in preference:
295+
return item
296+
preference.reverse()
288297
for item in preference:
289298
if item in score:
290299
return item

0 commit comments

Comments
 (0)