fixed a bug where calling parser Extract() timestamps() multiple times would return an empty list

sedthh · sedthh · commit 31a77922d192 · 2018-06-12T16:08:32.000+02:00
- fixed a bug where calling parser Extract() timestamps() multiple times would return an empty list (the cache stored the one-shot iterator returned by regex finditer(), which is already exhausted on later calls)
- updated entities based on user feedback
- added "Budapest" to inverse stemmer as an exception
- increased version number to 1.1.12
- updated README.md
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ alma_intents	= {
 	"szed"		: [{"stem":"szed","wordclass":"verb"}],
 	"piros"		: [{"stem":"piros","wordclass":"adjective"}]
 }
-alma_test		= parser.Intents(alma_intents)
+alma_test	= parser.Intents(alma_intents)
 print(alma_test.match("Mikor szedjük le a pirosabb almákat?"))
 
 >>> {'alma': 1, 'szed': 2, 'piros': 2}
@@ -167,7 +167,7 @@ query	= "Toto - Afrika"
 parts	= query.split('-')
 artist	= stemmer.inverse(parts[0],'től')	# "tól" and "től" are both valid
 title	= stemmer.inverse(parts[1],'t')
-the	= ('az' if nlp.vowel_beginning(title) else 'a')
+the	= nlp.az(title)
 	
 print('A zenelejátszó program az alábbi számot játssza:')
 print(artist,the,title)
diff --git a/lara/__init__.py b/lara/__init__.py
@@ -3,7 +3,7 @@
 # Lara - Lingusitic Aim Recognizer API
 
 __all__				= 'nlp','parser','stemmer','entities'
-__version__ 		= '1.1.11'
+__version__ 		= '1.1.12'
 __version_info__	= tuple(int(num) for num in __version__.split('.'))
 
 import sys
diff --git a/lara/entities.py b/lara/entities.py
@@ -27,7 +27,7 @@ def common():
 # menu commands
 def commands():
 	return {
-		"ok"			: [{"stem":"ye","affix":["s","ah","p"]},{"stem":"igen"},{"stem":"aha"},{"stem":"ja","affix":["ja","h"]},{"stem":"ok","affix":["é","s","és","sa","ay","ézd","ézza"],"exc":[{"stem":"nem"}]},{"stem":"úgy","exc":[{"stem":"nem"}]},{"stem":"így","exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"jó","wordclass":"adjective","max_words":4,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"},{"stem":"(nap\w+|reg+elt|est[eé]t)","wordclass":"regex"}]}],
+		"ok"			: [{"stem":"[jy]+e+a*[hps]*","wordclass":"regex"},{"stem":"igen"},{"stem":"aha"},{"stem":"ja","affix":["ja","h"]},{"stem":"ok","affix":["é","s","és","sa","ay","ézd","ézza"],"exc":[{"stem":"nem"}]},{"stem":"úgy","exc":[{"stem":"nem"}]},{"stem":"így","exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"jó","wordclass":"adjective","max_words":4,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"},{"stem":"(nap\w+|reg+elt|est[eé]t)","wordclass":"regex"}]}],
 		"cancel"		: [{"stem":"^([ae]z\s)?(\w+\s)?(nem?|no(pe|ne)?)(\s\w+)?(\s\w+)?$","boundary":False,"exc":[{"stem":"jó"},{"stem":"tud","wordclass":"verb"},{"stem":"sikerül","affix":["t"]},{"stem":"haragudj","affix":["on"]},{"stem":"tud","wordclass":"verb"}],"wordclass":"regex"},{"stem":"cancel"},{"stem":"mégse","affix":["m"],"max_words":4},{"stem":"elvetés"},{"stem":"ves[ds]e?\sel","wordclass":"regex"}],
 		"next"			: [{"stem":"next"},{"stem":"másikat","max_words":5},{"stem":"tovább","max_words":5,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"előre","max_words":5,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"még","max_words":4,"exc":[{"stem":"\w+[ad]\smeg","wordclass":"regex"},{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"},{"stem":"egy"},{"stem":"1"},{"stem":"hang\w*","wordclass":"regex"}]},{"stem":"more"},{"stem":"continue"},{"stem":"folyta[st]+(a|[ao]?[dn]|ni|[aá]s)?","wordclass":"regex","exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"következő","affix":["t","re"]},{"stem":"mond+([hj]a[dt]?)?(od|ja)?","wordclass":"regex","exc":[{"stem":"(vala)?[km]i([jlnv]?y?[eé])?[klrt]?","wordclass":"regex"},{"stem":"nem"},{"stem":"csak"},{"stem":"hogy(an)?\smond\w*","wordclass":"regex"}]}],
 		"back"			: [{"stem":"back"},{"stem":"vissza","max_words":5,"affix":["lép","lépés"],"exc":[{"stem":"hang\w*","wordclass":"regex"}]},{"stem":"hátra","max_words":4},{"stem":"előző","wordclass":"noun","max_words":5}],
@@ -90,7 +90,7 @@ def dow():
 		"holnap"		: [{"stem":"holnap(ig?|ra|pal|t[oó]l)?","wordclass":"regex","exc": [{"stem":"holnap\s?ut[aá]n(ig?|ra|nal|t[oó]l)?","wordclass":"regex"}]}],
 		"holnaputan"	: [{"stem":"holnap\s?ut[aá]n(ig?|ra|nal|t[oó]l)?","wordclass":"regex"}],
 		"tegnap"		: [{"stem":"tegnap(ig?|ra|pal|t[oó]l)?","wordclass":"regex","exc":[{"stem":"tegnap\sel[oő]t+?(ig?|re|t?el|t?[oó]l)?","wordclass":"regex"}]}],
-		"tegnapelott"	: [{"stem":"tegnap\sel[oő]t+(ig?|re|t?el|t?[oó]l)?","wordclass":"regex"}],
+		"tegnapelott"	: [{"stem":"tegnap\s?el[oő]t+(ig?|re|t?el|t?[oóöő]l)?","wordclass":"regex"}],
 		"hetfo"			: [{"stem":"hétfő","wordclass":"noun"}],
 		"kedd"			: [{"stem":"kedd","wordclass":"noun"}],
 		"szerda"		: [{"stem":"szerda","wordclass":"noun"}],
@@ -122,7 +122,7 @@ def smalltalk():
 		"about_look"	: [{"stem":"hogy(an)?\s(n[eé]zn?[eé]l\ski|mutatsz|festesz)","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+).+?(k[eé]pet|fot[oó]t|sz?elfie?t)\smagadr[oó]l","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+)\smagadr[oó]l.+?(k[eé]pet|fot[oó]t|sz?elfie?t)","wordclass":"regex"},{"stem":"(van|milyen)\s(az?\s)?(arcod|kin[eé]zeted)","wordclass":"regex"},{"stem":"szép vagy"}],
 		"about_age"		: [{"stem":"mennyi idős vagy"},{"stem":"hány éves vagy"},{"stem":"melyik évben születtél"},{"stem":"mikor születtél"},{"stem":"(melyik\s[eé]vben|mikor)\sk[eé]sz([uü]lt[eé]l|[ií]tet+ek)","wordclass":"regex"},{"stem":"(h[aá]ny(adik|ban)|mikor\s(van|[uü]n+epled)\s?a?)\ssz[uü]l(et[eé]s|i)napod(at)?","wordclass":"regex"},{"stem":"h[aá]ny\s[eé]vesnek\s.+?\smagad(at)?","wordclass":"regex"},{"stem":"sz[uü]l(et[eé]s)?i?napod(at)?\s(h[aá]nyadik[aá]n|mikor|melyik)","wordclass":"regex"}],
 		"about_zodiac"	: [{"stem":"(neked\s)?mi\sa\s(horoszk[oó]pod|csil+agjegyed)","wordclass":"regex"},{"stem":"milyen jegyben születtél"},{"stem":"a\s(te\s)?(horoszk[oó]pod|csil+agjegyed)\smi(csoda)?","wordclass":"regex"},{"stem":"milyen\sjegyben\ssz[uü]let+\w+","wordclass":"regex"}],
-		"about_location": [{"stem":"(hol|helyen)\s(k[eé]sz[uü]lt[eé]l|k[eé]sz[ií]tet+ek|sz[uü]let+[eé]l|(hoztak|j[oö]t+[eé]l).+?l[eé]tre)","wordclass":"regex"},{"stem":"hon+an\s(sz[aá]rmazol|[ií]rsz|val[oó]\svagy)","wordclass":"regex"},{"stem":"ho(n+an|l)\svagy\s(most\s)?(helyileg|most|pontosan)","wordclass":"regex"},{"stem":"(hol\s|mer+e\s)(laksz|(van|az?).+?ot+honod)","wordclass":"regex"}],
+		"about_location": [{"stem":"(hol|helyen)\s(k[eé]sz[uü]lt[eé]l|k[eé]sz[ií]tet+ek|sz[uü]let+[eé]l|(hoztak|j[oö]t+[eé]l).+?l[eé]tre)","wordclass":"regex"},{"stem":"hon+an\s(sz[aá]rmazol|[ií]rsz|val[oó]\svagy)","wordclass":"regex"},{"stem":"ho(n+an|l)\svagy\s(most\s)?(helyileg|most|pontosan)","wordclass":"regex"},{"stem":"(hol\s|mer+e\s)(laksz|(van|az?).+?ot+honod)","wordclass":"regex"},{"stem":"hol vagy","max_words":3}],
 		"about_family"	: [{"stem":"ki(k|t|ket)?\s(az?\s|tartasz\sa\s)?(te\s)?(csal[aá]dod(nak)?|sz[uü]l(t|et+[eé]l)|sz[uü]leid(nek)?|([eé]des)?(any(uk)?[aá]d|ap(uk)?[aá]d)(nak)?)","wordclass":"regex"},{"stem":"csal[aá]dban\s([eé]l(sz|tek)|sz[uü]let+[eé]l)","wordclass":"regex"},{"stem":"(h[aá]ny|van(nak)?)\stestv[eé]rei?d","wordclass":"regex"},{"stem":"(kik?|van(n?ak)?[\-\s]?e?)(\sa)?(\shoz+[aá]d?\s?tartoz[oó]i?d|csal[aá]dod)","wordclass":"regex"}],
 		"about_software": [{"stem":"(hogy(hogy|an)?|mit[oöő]l).+?(m[uüű]k[oö]dsz|(tudsz |vagy k[eé]pes )?(meg)?[eé]rte(sz|d|ni)\,? (meg )?(hogy )?(a?mit mond(ok|tam)|a?mit [ií]r(ok|tam)|engem))","wordclass":"regex"},{"stem":"mi(jen|lyen|en|\s?f[eé]le|\s?fajta)\sfekete\s?m[aá]gia","wordclass":"regex"},{"stem":"neur[aá]lis\sh[aá]l[oó]\w*","wordclass":"regex","inc":[{"stem":"vagy"},{"stem":"te"},{"stem":"működ","wordclass":"verb"}]}],
 		"about_skills"	: [{"stem":"mi(lyen|(ke)?t|k?re)\s(funkci[oó](id?|kat)\s|dolgok(at|ra)\s|tr[uü]k+([oö]k(et|re)|jeid?)\s|parancsok(at|ra)\s)?(tud(sz|n[aá]l)?\s(csin[aá]lni|mutatni)?|ismer(sz)?|(vagy\s|van\s)?(k[eé]pes|(be|meg)?tan[ií]tva)|tan[ií]tot+[aá]k\s(be|neked|meg)?|(k[eé]pes+[eé]gei?d?|tulajdons[aá]g(o|ai)d?)\svan(nak)?)","wordclass":"regex","exc":[{"stem":"mond","wordclass":"verb"}]},{"stem":"mihez ért","affix":["esz"]},{"stem":"mi((ke)?t|k?r[oöő]l)\s(lehet\s|szabad\s|tudok\s)?k[eé]rdez+h?e\w+","wordclass":"regex"}],
@@ -135,7 +135,7 @@ def smalltalk():
 		"are_you_busy"	: [{"stem":"elfoglalt","inc":[{"stem":"vagy"}]},{"stem":"r[aá]m?\s?[eé]r(n[eé]l|sz)(\smost)?(\segy)?(\skicsit|\skis\s\w+|\svalamen+yi\w*)?","wordclass":"regex"},{"stem":"(van|volna)\s(most\s)?(r[aá]m?\s)?(most\s)?(egy\s)?(kis\s|kev[eé]s\s|valamen+yi\s)?(szabad\s?)?id[oöő]d(\sr[aá]m)?","wordclass":"regex"},{"stem":"sok dolgod van"}],
 		"are_you_lying"	: [{"stem":"hazud","wordclass":"verb"},{"stem":"nem mondt[aá][dl]\s((el|meg)\saz\s)?igaz(at|s[aá]got)","wordclass":"regex"}],
 		"are_you_serious": [{"stem":"(nem?|csak)\s(vic+el(sz|j)?|mond+(od|ja)?|ideges[ií]ts(en)?)","wordclass":"regex"},{"stem":"(komolyan|t[eé]nyleg)\s?([uúií]gy\s|azt\s)?((mond|gondol|[ií]r)(ja|od|tad?)|hisz(i|ed)|hit+ed?)","wordclass":"regex"},{"stem":"biztos(an)?\s(vagy\s)?(\w+\s)?(ben+e|eb+en|mond(ta|o)d|mond[jt]a)","wordclass":"regex"},{"stem":"ezt?\s(most\s)?komoly(an)?","wordclass":"regex"}],
-		"can_you_hear_me": [{"stem":"(olvas+a|hal+ja|n[eé]zi|van\sit+)(\sezt)?\s(vala|b[aá]r)ki(\sis)?","wordclass":"regex"},{"stem":"(hal+(asz|od)|l[aá]t(sz|od)|vesze[ld])\s(engem|a?mit\s(mondok|[ií]rok|k[eé]rdezek))","wordclass":"regex"},{"stem":"valaki\s(hal+(ja)?\s|olvas+a|figyeli?(\sar+a)?)\sa?mit\s(ide\s?|it+\s)?([ií]rok|mondok|k[eé]rdezek)","wordclass":"regex"},{"stem":"felfogtad","max_words":3},{"stem":"itt","inc":[{"stem":"vagy"},{"stem":"van"}],"max_words":3}],
+		"can_you_hear_me": [{"stem":"(olvas+a|hal+ja|n[eé]zi|van\sit+)(\sezt)?\s(vala|b[aá]r)ki(\sis)?","wordclass":"regex"},{"stem":"(hal+(asz|od)|l[aá]t(sz|od)|vesze[ld])\s(engem|a?mit\s(mondok|[ií]rok|k[eé]rdezek))","wordclass":"regex"},{"stem":"valaki\s(hal+(ja)?\s|olvas+a|figyeli?(\sar+a)?)\sa?mit\s(ide\s?|it+\s)?([ií]rok|mondok|k[eé]rdezek)","wordclass":"regex"},{"stem":"felfogtad","max_words":3},{"stem":"itt","inc":[{"stem":"vagy"},{"stem":"van"}],"max_words":3},{"stem":"halló","max_words":3}],
 		"can_you_learn": [{"stem":"(k[eé]pes(\svagy)?|tud(sz)?)\stanulni","wordclass":"regex"},{"stem":"tanulsz\s(is|[ae].+?b[oóöő]l)","wordclass":"regex"},{"stem":"[dln][aáeéo][km]\s(be|meg)?tan[ií]tani\b","wordclass":"regex","boundary":False}],
 		"can_you_understand_me":[{"stem":"(meg)?[eé]rt(e(d|sz|t+ed?)|i)\,?((\shogy)?\sa?mit\s([ií]r|mond)\w+|\smagyarul)","wordclass":"regex"}],
 		"contact"		: [{"stem":"mi(lyen)?\s(.+?\s)?(e\-?mail\s?)?c[ií]me[dn]?","wordclass":"regex"},{"stem":"elérhetőség","wordclass":"noun"},{"stem":"elér","wordclass":"verb","inc":[{"stem":"önt"},{"stem":"téged"}]}],
diff --git a/lara/parser.py b/lara/parser.py
@@ -1045,7 +1045,7 @@ def relative_dates(self,normalize=True,current=False):
 				results	=	[]
 				for item in matches:
 					item	= item.lower()
-					now	= _now
+					now		= _now
 					if 'holnap' in item:
 						if 'ut' in item:
 							now			+= datetime.timedelta(days = 2)
@@ -1084,27 +1084,30 @@ def relative_dates(self,normalize=True,current=False):
 	def timestamps(self,current=False):
 		# testing environment
 		c_relative		= False
-		c_times		= False
+		c_times			= False
 		if current:
-			c_relative	= current.split()[0]
-			now			= c_relative
-			c_times		= int((current.split()[1]).split(':')[0])
+			c_relative		= current.split()[0]
+			now				= c_relative
+			c_times			= int((current.split()[1]).split(':')[0])
 		else:
-			now			=	datetime.datetime.now().strftime('%Y-%m-%d')
+			now				= datetime.datetime.now().strftime('%Y-%m-%d')
 		dates			= self.dates(False)
 		relative		= self.relative_dates(False,c_relative)
 		times			= self.times(False,True,c_times)
 		dates_pos		= []
 		relative_pos	= []
 		times_pos		= []
 		for item in dates:
-			for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
+			regex	= re.compile(r'\b'+re.escape(item), re.IGNORECASE)
+			for match in regex.finditer(self.ntext):
 				dates_pos.append(match.span()[0])
 		for item in relative:
-			for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
+			regex	= re.compile(r'\b'+re.escape(item), re.IGNORECASE)
+			for match in regex.finditer(self.ntext):
 				relative_pos.append(match.span()[0])
 		for item in times:
-			for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
+			regex	= re.compile(r'\b'+re.escape(item), re.IGNORECASE)
+			for match in regex.finditer(self.ntext):
 				times_pos.append(match.span()[0])
 		dates_pos.append(-1)
 		relative_pos.append(-1)
@@ -1308,7 +1311,8 @@ def findall(expression,flags,text):
 			_re.findall_cache[flags_str][t_hash][e_hash]	= _re.compile_cache[flags_str][e_hash].findall(text)
 		return _re.findall_cache[flags_str][t_hash][e_hash]
 	
-	# cache re.compile().finditer() outputs
+	# for some unknown reason this does not work as intended
+	'''
 	def finditer(expression,flags,text):
 		e_hash			= hashlib.sha1(str(expression).encode("utf-8")).hexdigest()
 		_re.compile(e_hash,expression,flags)
@@ -1321,6 +1325,7 @@ def finditer(expression,flags,text):
 		if e_hash not in _re.finditer_cache[flags_str][t_hash]:
 			_re.finditer_cache[flags_str][t_hash][e_hash]	= _re.compile_cache[flags_str][e_hash].finditer(text)
 		return _re.finditer_cache[flags_str][t_hash][e_hash]
+	'''
 	
 	# user cached re.compile() for re.sub()
 	def sub(expression,flags,repl,string):
diff --git a/lara/stemmer.py b/lara/stemmer.py
@@ -347,6 +347,8 @@ def inverse(word,affix):
 			return 'arra'
 		if word=='ez':
 			return 'erre'
+		if word.lower()=='budapest':
+			return result+'re'
 		if word[-1].lower() in ('a','e'):
 			result	= result[:-1]+result[-1].replace('a','á').replace('e','é')
 		if vh == 'magas':
@@ -494,7 +496,9 @@ def inverse(word,affix):
 				return "tavon"
 			elif word.lower()=="ló":
 				return "lovon"
-		if word.lower()=="pécs":
+		if word.lower()=="budapest":
+			return "budapesten"
+		elif word.lower()=="pécs":
 			return "pécsett"
 		elif word.lower()=="győr":
 			return "győrött"