minor changes in parser Intents() class

sedthh · sedthh · commit bc4adbc4765f · 2018-05-15T16:22:51.000+02:00
- parser Intents() match_order() now accepts preferences in reverse order (via the new `reverse` argument)
- typo_prefix and typo_affix now accept matches where the prefix/affix is separated from the stem by a space
- fixed a bug where typo_stem could end up empty and thus match every text
- added Mézga Géza reference to entities
- increased version number to 1.1.7
diff --git a/lara/__init__.py b/lara/__init__.py
@@ -3,7 +3,7 @@
 # Lara - Lingusitic Aim Recognizer API
 
 __all__				= 'nlp','parser','stemmer','entities'
-__version__ 		= '1.1.6'
+__version__ 		= '1.1.7'
 __version_info__	= tuple(int(num) for num in __version__.split('.'))
 
 import sys
diff --git a/lara/entities.py b/lara/entities.py
@@ -45,7 +45,7 @@ def commands():
 		"redo"			: [{"stem":"mégis"},{"stem":"(meg)ism[eé]t(l[eé]s|el(je)?d?)?","wordclass":"regex"},{"stem":"el[oöő]rel[eé]p([eé]s)?","wordclass":"regex"},{"stem":"l[eé]p.+?el[oöő]re","wordclass":"regex"},{"stem":"redo"},{"stem":"m[eé]gse.+?von.+?vis+za","wordclass":"regex"}],
 		"restart"		: [{"stem":"ind[ií][ct]+sa?d?(\sel)?(\s[uú]j(ra|b[oó]l))","wordclass":"regex"},{"stem":"újraindít","wordclass":"verb"},{"stem":"(([uú]jra)?kezd\w{0,5}|kezd\w{0,5}.+?([uú]jra|el[oöő]l?r[oö]l|(leg)?elej[eé](t|r)[oöő]l))","wordclass":"regex"}],
 		"play"			: [{"stem":"(le)?j[aá](ts+z|c+)([aá]([dls]|ni))?(\sle)?(\svalamit?)?(\segy)?","wordclass":"regex"},{"stem":"play"},{"stem":"indít","wordclass":"verb","prefix":["el"],"exc":[{"stem":"újra"}]}],
-		"stop"			: [{"stem":"(meg|le)?[aá]l+(j+([aá]l)?|[ií][ct]+(s?a?d|sa|[aá](ni|s)))(\smeg|\sle)?","wordclass":"regex"},{"stem":"stop"},{"stem":"el[eé]g(\sis)?(\sle(sz|gyen))?(\sm[aá]r)?(\smost)?(\sen+yi)?","wordclass":"regex"},{"stem":"(kus+(olj([aá]l)?)?|fog(ja)?d\s?be)","wordclass":"regex"},{"stem":"(s+h{2,})|(p+s+z*t+)","wordclass":"regex"}],
+		"stop"			: [{"stem":"(meg|le)?[aá]l+(j+([aá]l)?|[ií][ct]+(s?a?d|sa|[aá](ni|s)))(\smeg|\sle)?","wordclass":"regex"},{"stem":"stop"},{"stem":"el[eé]g(\sis)?(\sle(sz|gyen))?(\sm[aá]r)?(\smost)?(\sen+yi)?","wordclass":"regex"},{"stem":"(kus+(olj([aá]l)?)?|fog(ja)?d\s?be)","wordclass":"regex"},{"stem":"(s+h{2,})|(p+s+z*t+)","wordclass":"regex"},{"stem":"a[bp]+\s?h?agy","wordclass":"regex"}],
 		"pause"			: [{"stem":"pau[sz][aáeé]([lz]+((as+a|[jz]a)?[dj]|[jz]a|ni))?(\sle)?","wordclass":"regex"},{"stem":"sz[uü]net(elt?(et)?([eé]?s+e?d?|ni)?)?","wordclass":"regex"}],
 		"resume"		: [{"stem":"folyta\w+","wordclass":"regex"},{"stem":"resume"}],
 		"skip"			: [{"stem":"(kihagy\w+|hag+yj?a?d?\ski|([aá]t|tov[aá]b+)(l[eé]p|ugr[aá])\w*|(ugr[aá]s|ugor\w+|l[eé]p(je)?[dn])\s([aá]t|tov[aá]b+))","wordclass":"regex"},{"stem":"(sz?kip+(el\w*)?|m[aá]sikat)","wordclass":"regex"}],
diff --git a/lara/parser.py b/lara/parser.py
@@ -104,6 +104,8 @@ def _generate(self, item):
 				item['typo_stem']	= item['stem']
 			else:
 				item['typo_stem']	= lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(item['stem'])))
+				if not item['typo_stem']:
+					item['typo_stem']	= lara.nlp.trim(lara.nlp.remove_double_letters(item['stem']))
 		
 		if 'prefix' not in item:
 			if item['wordclass']	== 'verb':
@@ -122,13 +124,13 @@ def _generate(self, item):
 			if 'typo_prefix' not in item:
 				if isinstance(item['prefix'],list):
 					typo_prefix			= ['(?:'+self._scramble(lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(elem))), (item['wordclass'] == 'adjective'))+')' for elem in item['prefix']]
-					item['typo_prefix']	= r'(?:'+('|'.join(typo_prefix))+r')?'
+					item['typo_prefix']	= r'(?:'+('|'.join(typo_prefix))+r')?\s?'
 				else:
 					item['typo_prefix']	= r''+lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(item['prefix'])))
 			else:
 				if isinstance(item['typo_prefix'],list):
 					item['typo_prefix']	=  [re.escape(prefix) for prefix in item['typo_prefix']]
-					item['typo_prefix']	= r'(?:'+('|'.join(item['typo_prefix']))+')?' #prefix?
+					item['typo_prefix']	= r'(?:'+('|'.join(item['typo_prefix']))+')?\s?'
 				else:
 					item['typo_prefix']	= r''+(item['typo_prefix'])
 			if isinstance(item['prefix'],list):
@@ -142,14 +144,14 @@ def _generate(self, item):
 		else:
 			if 'typo_affix' not in item:
 				if isinstance(item['affix'],list):
-					typo_affix				= ['(?:'+self._scramble(lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(elem))), (item['wordclass'] == 'adjective'))+')' for elem in item['affix']]
-					item['typo_affix']	= r'(?:'+('|'.join(typo_affix))+r')?'
+					typo_affix			= ['(?:'+self._scramble(lara.nlp.trim(lara.nlp.strip_accents(lara.nlp.remove_double_letters(elem))), (item['wordclass'] == 'adjective'))+')' for elem in item['affix']]
+					item['typo_affix']	= r'\s?(?:'+('|'.join(typo_affix))+r')?'
 				else:
 					item['typo_affix']	= r''+lara.nlp.strip_accents(item['affix'])
 			else:
 				if isinstance(item['typo_affix'],list):
 					item['typo_affix']	=  [re.escape(affix) for affix in item['typo_affix']]
-					item['typo_affix']	= r'(?:'+('|'.join(item['typo_affix']))+')?'
+					item['typo_affix']	= r'\s?(?:'+('|'.join(item['typo_affix']))+')?'
 				else:
 					item['typo_affix']	= r''+(item['typo_affix'])
 			if isinstance(item['affix'],list):
@@ -281,10 +283,17 @@ def match_best(self, text, n=1):
 		return {}
 		
 	# Get best match based on preference hierarchy
-	def match_order(self,text,preference=[]):
+	def match_order(self,text,preference=[],reverse=False):
 		if text:
 			score	= self.match(text)
 			if score:
+				if reverse:
+					if max(score, key=score.get) not in preference:
+						return max(score, key=score.get)
+					for item in score:
+						if item not in preference:
+							return item
+					preference.reverse()
 				for item in preference:
 					if item in score:
 						return item