Skip to content

Commit 31a7792

Browse files
committed
fixed a bug where calling parser Extract() timestamps() multiple times would return an empty list
- fixed a bug where calling parser Extract() timestamps() multiple times would return an empty list (due to caching of regex finditer iterators, which are exhausted after their first use) - updated entities based on user feedback - added "Budapest" to inverse stemmer as an exception - increased version number to 1.1.12 - updated README.md
1 parent 80b6b85 commit 31a7792

File tree

5 files changed

+27
-18
lines changed

5 files changed

+27
-18
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ alma_intents = {
4848
"szed" : [{"stem":"szed","wordclass":"verb"}],
4949
"piros" : [{"stem":"piros","wordclass":"adjective"}]
5050
}
51-
alma_test = parser.Intents(alma_intents)
51+
alma_test = parser.Intents(alma_intents)
5252
print(alma_test.match("Mikor szedjük le a pirosabb almákat?"))
5353

5454
>>> {'alma': 1, 'szed': 2, 'piros': 2}
@@ -167,7 +167,7 @@ query = "Toto - Afrika"
167167
parts = query.split('-')
168168
artist = stemmer.inverse(parts[0],'től') # "tól" and "től" are both valid
169169
title = stemmer.inverse(parts[1],'t')
170-
the = ('az' if nlp.vowel_beginning(title) else 'a')
170+
the = nlp.az(title)
171171

172172
print('A zenelejátszó program az alábbi számot játssza:')
173173
print(artist,the,title)

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.1.11'
6+
__version__ = '1.1.12'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/entities.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def common():
2727
# menu commands
2828
def commands():
2929
return {
30-
"ok" : [{"stem":"ye","affix":["s","ah","p"]},{"stem":"igen"},{"stem":"aha"},{"stem":"ja","affix":["ja","h"]},{"stem":"ok","affix":["é","s","és","sa","ay","ézd","ézza"],"exc":[{"stem":"nem"}]},{"stem":"úgy","exc":[{"stem":"nem"}]},{"stem":"így","exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"jó","wordclass":"adjective","max_words":4,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"},{"stem":"(nap\w+|reg+elt|est[eé]t)","wordclass":"regex"}]}],
30+
"ok" : [{"stem":"[jy]+e+a*[hps]*","wordclass":"regex"},{"stem":"igen"},{"stem":"aha"},{"stem":"ja","affix":["ja","h"]},{"stem":"ok","affix":["é","s","és","sa","ay","ézd","ézza"],"exc":[{"stem":"nem"}]},{"stem":"úgy","exc":[{"stem":"nem"}]},{"stem":"így","exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"jó","wordclass":"adjective","max_words":4,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"},{"stem":"(nap\w+|reg+elt|est[eé]t)","wordclass":"regex"}]}],
3131
"cancel" : [{"stem":"^([ae]z\s)?(\w+\s)?(nem?|no(pe|ne)?)(\s\w+)?(\s\w+)?$","boundary":False,"exc":[{"stem":"jó"},{"stem":"tud","wordclass":"verb"},{"stem":"sikerül","affix":["t"]},{"stem":"haragudj","affix":["on"]},{"stem":"tud","wordclass":"verb"}],"wordclass":"regex"},{"stem":"cancel"},{"stem":"mégse","affix":["m"],"max_words":4},{"stem":"elvetés"},{"stem":"ves[ds]e?\sel","wordclass":"regex"}],
3232
"next" : [{"stem":"next"},{"stem":"másikat","max_words":5},{"stem":"tovább","max_words":5,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"előre","max_words":5,"exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"még","max_words":4,"exc":[{"stem":"\w+[ad]\smeg","wordclass":"regex"},{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"},{"stem":"egy"},{"stem":"1"},{"stem":"hang\w*","wordclass":"regex"}]},{"stem":"more"},{"stem":"continue"},{"stem":"folyta[st]+(a|[ao]?[dn]|ni|[aá]s)?","wordclass":"regex","exc":[{"stem":"((m[eé]g)?[ns]em*i?|baj)","wordclass":"regex"}]},{"stem":"következő","affix":["t","re"]},{"stem":"mond+([hj]a[dt]?)?(od|ja)?","wordclass":"regex","exc":[{"stem":"(vala)?[km]i([jlnv]?y?[eé])?[klrt]?","wordclass":"regex"},{"stem":"nem"},{"stem":"csak"},{"stem":"hogy(an)?\smond\w*","wordclass":"regex"}]}],
3333
"back" : [{"stem":"back"},{"stem":"vissza","max_words":5,"affix":["lép","lépés"],"exc":[{"stem":"hang\w*","wordclass":"regex"}]},{"stem":"hátra","max_words":4},{"stem":"előző","wordclass":"noun","max_words":5}],
@@ -90,7 +90,7 @@ def dow():
9090
"holnap" : [{"stem":"holnap(ig?|ra|pal|t[oó]l)?","wordclass":"regex","exc": [{"stem":"holnap\s?ut[aá]n(ig?|ra|nal|t[oó]l)?","wordclass":"regex"}]}],
9191
"holnaputan" : [{"stem":"holnap\s?ut[aá]n(ig?|ra|nal|t[oó]l)?","wordclass":"regex"}],
9292
"tegnap" : [{"stem":"tegnap(ig?|ra|pal|t[oó]l)?","wordclass":"regex","exc":[{"stem":"tegnap\sel[oő]t+?(ig?|re|t?el|t?[oó]l)?","wordclass":"regex"}]}],
93-
"tegnapelott" : [{"stem":"tegnap\sel[oő]t+(ig?|re|t?el|t?[]l)?","wordclass":"regex"}],
93+
"tegnapelott" : [{"stem":"tegnap\s?el[oő]t+(ig?|re|t?el|t?[oóöő]l)?","wordclass":"regex"}],
9494
"hetfo" : [{"stem":"hétfő","wordclass":"noun"}],
9595
"kedd" : [{"stem":"kedd","wordclass":"noun"}],
9696
"szerda" : [{"stem":"szerda","wordclass":"noun"}],
@@ -122,7 +122,7 @@ def smalltalk():
122122
"about_look" : [{"stem":"hogy(an)?\s(n[eé]zn?[eé]l\ski|mutatsz|festesz)","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+).+?(k[eé]pet|fot[oó]t|sz?elfie?t)\smagadr[oó]l","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+)\smagadr[oó]l.+?(k[eé]pet|fot[oó]t|sz?elfie?t)","wordclass":"regex"},{"stem":"(van|milyen)\s(az?\s)?(arcod|kin[eé]zeted)","wordclass":"regex"},{"stem":"szép vagy"}],
123123
"about_age" : [{"stem":"mennyi idős vagy"},{"stem":"hány éves vagy"},{"stem":"melyik évben születtél"},{"stem":"mikor születtél"},{"stem":"(melyik\s[eé]vben|mikor)\sk[eé]sz([uü]lt[eé]l|[ií]tet+ek)","wordclass":"regex"},{"stem":"(h[aá]ny(adik|ban)|mikor\s(van|[uü]n+epled)\s?a?)\ssz[uü]l(et[eé]s|i)napod(at)?","wordclass":"regex"},{"stem":"h[aá]ny\s[eé]vesnek\s.+?\smagad(at)?","wordclass":"regex"},{"stem":"sz[uü]l(et[eé]s)?i?napod(at)?\s(h[aá]nyadik[aá]n|mikor|melyik)","wordclass":"regex"}],
124124
"about_zodiac" : [{"stem":"(neked\s)?mi\sa\s(horoszk[oó]pod|csil+agjegyed)","wordclass":"regex"},{"stem":"milyen jegyben születtél"},{"stem":"a\s(te\s)?(horoszk[oó]pod|csil+agjegyed)\smi(csoda)?","wordclass":"regex"},{"stem":"milyen\sjegyben\ssz[uü]let+\w+","wordclass":"regex"}],
125-
"about_location": [{"stem":"(hol|helyen)\s(k[eé]sz[uü]lt[eé]l|k[eé]sz[ií]tet+ek|sz[uü]let+[eé]l|(hoztak|j[oö]t+[eé]l).+?l[eé]tre)","wordclass":"regex"},{"stem":"hon+an\s(sz[aá]rmazol|[ií]rsz|val[oó]\svagy)","wordclass":"regex"},{"stem":"ho(n+an|l)\svagy\s(most\s)?(helyileg|most|pontosan)","wordclass":"regex"},{"stem":"(hol\s|mer+e\s)(laksz|(van|az?).+?ot+honod)","wordclass":"regex"}],
125+
"about_location": [{"stem":"(hol|helyen)\s(k[eé]sz[uü]lt[eé]l|k[eé]sz[ií]tet+ek|sz[uü]let+[eé]l|(hoztak|j[oö]t+[eé]l).+?l[eé]tre)","wordclass":"regex"},{"stem":"hon+an\s(sz[aá]rmazol|[ií]rsz|val[oó]\svagy)","wordclass":"regex"},{"stem":"ho(n+an|l)\svagy\s(most\s)?(helyileg|most|pontosan)","wordclass":"regex"},{"stem":"(hol\s|mer+e\s)(laksz|(van|az?).+?ot+honod)","wordclass":"regex"},{"stem":"hol vagy","max_words":3}],
126126
"about_family" : [{"stem":"ki(k|t|ket)?\s(az?\s|tartasz\sa\s)?(te\s)?(csal[aá]dod(nak)?|sz[uü]l(t|et+[eé]l)|sz[uü]leid(nek)?|([eé]des)?(any(uk)?[aá]d|ap(uk)?[aá]d)(nak)?)","wordclass":"regex"},{"stem":"csal[aá]dban\s([eé]l(sz|tek)|sz[uü]let+[eé]l)","wordclass":"regex"},{"stem":"(h[aá]ny|van(nak)?)\stestv[eé]rei?d","wordclass":"regex"},{"stem":"(kik?|van(n?ak)?[\-\s]?e?)(\sa)?(\shoz+[aá]d?\s?tartoz[oó]i?d|csal[aá]dod)","wordclass":"regex"}],
127127
"about_software": [{"stem":"(hogy(hogy|an)?|mit[oöő]l).+?(m[uüű]k[oö]dsz|(tudsz |vagy k[eé]pes )?(meg)?[eé]rte(sz|d|ni)\,? (meg )?(hogy )?(a?mit mond(ok|tam)|a?mit [ií]r(ok|tam)|engem))","wordclass":"regex"},{"stem":"mi(jen|lyen|en|\s?f[eé]le|\s?fajta)\sfekete\s?m[aá]gia","wordclass":"regex"},{"stem":"neur[aá]lis\sh[aá]l[oó]\w*","wordclass":"regex","inc":[{"stem":"vagy"},{"stem":"te"},{"stem":"működ","wordclass":"verb"}]}],
128128
"about_skills" : [{"stem":"mi(lyen|(ke)?t|k?re)\s(funkci[oó](id?|kat)\s|dolgok(at|ra)\s|tr[uü]k+([oö]k(et|re)|jeid?)\s|parancsok(at|ra)\s)?(tud(sz|n[aá]l)?\s(csin[aá]lni|mutatni)?|ismer(sz)?|(vagy\s|van\s)?(k[eé]pes|(be|meg)?tan[ií]tva)|tan[ií]tot+[aá]k\s(be|neked|meg)?|(k[eé]pes+[eé]gei?d?|tulajdons[aá]g(o|ai)d?)\svan(nak)?)","wordclass":"regex","exc":[{"stem":"mond","wordclass":"verb"}]},{"stem":"mihez ért","affix":["esz"]},{"stem":"mi((ke)?t|k?r[oöő]l)\s(lehet\s|szabad\s|tudok\s)?k[eé]rdez+h?e\w+","wordclass":"regex"}],
@@ -135,7 +135,7 @@ def smalltalk():
135135
"are_you_busy" : [{"stem":"elfoglalt","inc":[{"stem":"vagy"}]},{"stem":"r[aá]m?\s?[eé]r(n[eé]l|sz)(\smost)?(\segy)?(\skicsit|\skis\s\w+|\svalamen+yi\w*)?","wordclass":"regex"},{"stem":"(van|volna)\s(most\s)?(r[aá]m?\s)?(most\s)?(egy\s)?(kis\s|kev[eé]s\s|valamen+yi\s)?(szabad\s?)?id[oöő]d(\sr[aá]m)?","wordclass":"regex"},{"stem":"sok dolgod van"}],
136136
"are_you_lying" : [{"stem":"hazud","wordclass":"verb"},{"stem":"nem mondt[aá][dl]\s((el|meg)\saz\s)?igaz(at|s[aá]got)","wordclass":"regex"}],
137137
"are_you_serious": [{"stem":"(nem?|csak)\s(vic+el(sz|j)?|mond+(od|ja)?|ideges[ií]ts(en)?)","wordclass":"regex"},{"stem":"(komolyan|t[eé]nyleg)\s?([uúií]gy\s|azt\s)?((mond|gondol|[ií]r)(ja|od|tad?)|hisz(i|ed)|hit+ed?)","wordclass":"regex"},{"stem":"biztos(an)?\s(vagy\s)?(\w+\s)?(ben+e|eb+en|mond(ta|o)d|mond[jt]a)","wordclass":"regex"},{"stem":"ezt?\s(most\s)?komoly(an)?","wordclass":"regex"}],
138-
"can_you_hear_me": [{"stem":"(olvas+a|hal+ja|n[eé]zi|van\sit+)(\sezt)?\s(vala|b[aá]r)ki(\sis)?","wordclass":"regex"},{"stem":"(hal+(asz|od)|l[aá]t(sz|od)|vesze[ld])\s(engem|a?mit\s(mondok|[ií]rok|k[eé]rdezek))","wordclass":"regex"},{"stem":"valaki\s(hal+(ja)?\s|olvas+a|figyeli?(\sar+a)?)\sa?mit\s(ide\s?|it+\s)?([ií]rok|mondok|k[eé]rdezek)","wordclass":"regex"},{"stem":"felfogtad","max_words":3},{"stem":"itt","inc":[{"stem":"vagy"},{"stem":"van"}],"max_words":3}],
138+
"can_you_hear_me": [{"stem":"(olvas+a|hal+ja|n[eé]zi|van\sit+)(\sezt)?\s(vala|b[aá]r)ki(\sis)?","wordclass":"regex"},{"stem":"(hal+(asz|od)|l[aá]t(sz|od)|vesze[ld])\s(engem|a?mit\s(mondok|[ií]rok|k[eé]rdezek))","wordclass":"regex"},{"stem":"valaki\s(hal+(ja)?\s|olvas+a|figyeli?(\sar+a)?)\sa?mit\s(ide\s?|it+\s)?([ií]rok|mondok|k[eé]rdezek)","wordclass":"regex"},{"stem":"felfogtad","max_words":3},{"stem":"itt","inc":[{"stem":"vagy"},{"stem":"van"}],"max_words":3},{"stem":"halló","max_words":3}],
139139
"can_you_learn": [{"stem":"(k[eé]pes(\svagy)?|tud(sz)?)\stanulni","wordclass":"regex"},{"stem":"tanulsz\s(is|[ae].+?b[oóöő]l)","wordclass":"regex"},{"stem":"[dln][aáeéo][km]\s(be|meg)?tan[ií]tani\b","wordclass":"regex","boundary":False}],
140140
"can_you_understand_me":[{"stem":"(meg)?[eé]rt(e(d|sz|t+ed?)|i)\,?((\shogy)?\sa?mit\s([ií]r|mond)\w+|\smagyarul)","wordclass":"regex"}],
141141
"contact" : [{"stem":"mi(lyen)?\s(.+?\s)?(e\-?mail\s?)?c[ií]me[dn]?","wordclass":"regex"},{"stem":"elérhetőség","wordclass":"noun"},{"stem":"elér","wordclass":"verb","inc":[{"stem":"önt"},{"stem":"téged"}]}],

lara/parser.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1045,7 +1045,7 @@ def relative_dates(self,normalize=True,current=False):
10451045
results = []
10461046
for item in matches:
10471047
item = item.lower()
1048-
now = _now
1048+
now = _now
10491049
if 'holnap' in item:
10501050
if 'ut' in item:
10511051
now += datetime.timedelta(days = 2)
@@ -1084,27 +1084,30 @@ def relative_dates(self,normalize=True,current=False):
10841084
def timestamps(self,current=False):
10851085
# testing environment
10861086
c_relative = False
1087-
c_times = False
1087+
c_times = False
10881088
if current:
1089-
c_relative = current.split()[0]
1090-
now = c_relative
1091-
c_times = int((current.split()[1]).split(':')[0])
1089+
c_relative = current.split()[0]
1090+
now = c_relative
1091+
c_times = int((current.split()[1]).split(':')[0])
10921092
else:
1093-
now = datetime.datetime.now().strftime('%Y-%m-%d')
1093+
now = datetime.datetime.now().strftime('%Y-%m-%d')
10941094
dates = self.dates(False)
10951095
relative = self.relative_dates(False,c_relative)
10961096
times = self.times(False,True,c_times)
10971097
dates_pos = []
10981098
relative_pos = []
10991099
times_pos = []
11001100
for item in dates:
1101-
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
1101+
regex = re.compile(r'\b'+re.escape(item), re.IGNORECASE)
1102+
for match in regex.finditer(self.ntext):
11021103
dates_pos.append(match.span()[0])
11031104
for item in relative:
1104-
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
1105+
regex = re.compile(r'\b'+re.escape(item), re.IGNORECASE)
1106+
for match in regex.finditer(self.ntext):
11051107
relative_pos.append(match.span()[0])
11061108
for item in times:
1107-
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
1109+
regex = re.compile(r'\b'+re.escape(item), re.IGNORECASE)
1110+
for match in regex.finditer(self.ntext):
11081111
times_pos.append(match.span()[0])
11091112
dates_pos.append(-1)
11101113
relative_pos.append(-1)
@@ -1308,7 +1311,8 @@ def findall(expression,flags,text):
13081311
_re.findall_cache[flags_str][t_hash][e_hash] = _re.compile_cache[flags_str][e_hash].findall(text)
13091312
return _re.findall_cache[flags_str][t_hash][e_hash]
13101313

1311-
# cache re.compile().finditer() outputs
1314+
# disabled: finditer() returns a one-shot iterator, so a cached entry is already exhausted on the second lookup and yields no matches
1315+
'''
13121316
def finditer(expression,flags,text):
13131317
e_hash = hashlib.sha1(str(expression).encode("utf-8")).hexdigest()
13141318
_re.compile(e_hash,expression,flags)
@@ -1321,6 +1325,7 @@ def finditer(expression,flags,text):
13211325
if e_hash not in _re.finditer_cache[flags_str][t_hash]:
13221326
_re.finditer_cache[flags_str][t_hash][e_hash] = _re.compile_cache[flags_str][e_hash].finditer(text)
13231327
return _re.finditer_cache[flags_str][t_hash][e_hash]
1328+
'''
13241329

13251330
# use cached re.compile() for re.sub()
13261331
def sub(expression,flags,repl,string):

lara/stemmer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,8 @@ def inverse(word,affix):
347347
return 'arra'
348348
if word=='ez':
349349
return 'erre'
350+
if word.lower()=='budapest':
351+
return result+'re'
350352
if word[-1].lower() in ('a','e'):
351353
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
352354
if vh == 'magas':
@@ -494,7 +496,9 @@ def inverse(word,affix):
494496
return "tavon"
495497
elif word.lower()=="ló":
496498
return "lovon"
497-
if word.lower()=="pécs":
499+
if word.lower()=="budapest":
500+
return "budapesten"
501+
elif word.lower()=="pécs":
498502
return "pécsett"
499503
elif word.lower()=="győr":
500504
return "győrött"

0 commit comments

Comments
 (0)