Skip to content

Commit 25419e4

Browse files
committed
fixed minor bug where boundary declarations would be ignored for regex stems
- fixed a bug where \b would be interpreted as a backspace character instead of the regex word boundary \\b for boundary declarations in regex stems - updated entities based on user feedback - increased version number to 1.1.9
1 parent 45d8a43 commit 25419e4

File tree

3 files changed

+12
-6
lines changed

3 files changed

+12
-6
lines changed

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.1.8'
6+
__version__ = '1.1.9'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/entities.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ def common():
77
"no" : [{"stem":"n","max_words":1},{"stem":"no","max_words":3},{"stem":"nem","exc":[{"stem":"megy"},{"stem":"baj"},{"stem":"tud","wordclass":"verb"},{"stem":"ért","wordclass":"verb"}]},{"stem":"ne","exc":[{"stem":"haragudj","affix":["on"]}]},{"stem":"soha"},{"stem":"mégse","affix":["m"]},{"stem":"ros+z\s(v[aá]lasz|vic+|megold[aá]s)","wordclass":"regex"},{"stem":"nincs rendben"}],
88
"hi" : [{"stem":"ha?i+","wordclass":"regex"},{"stem":"s+z+i+[aoó](ka|sztok)?","wordclass":"regex"},{"stem":"helló","affix":["ka"]},{"stem":"szer?[bv][au]sz(tok)?","wordclass":"regex"},{"stem":"hali","affix":["hó"]},{"stem":"(sz[eé]p|j[oó])\s?(reg+el|nap|est[eé])(o?t|[eéuü]nk)","wordclass":"regex"},{"stem":"[uü]dv([oö]z[oö]?l+(e[kt])?([eoö]m)?)?","wordclass":"regex"},{"stem":"örvendek"}],
99
"bye" : [{"stem":"bye"},{"stem":"viszlát"},{"stem":"viszont látásra"},{"stem":"jó éj","affix":["t","szakát"]},{"stem":"jóccakát"},{"stem":"mennem kell"},{"stem":"csumi"},{"stem":"cs[aáoöő]+[oó]*(v[aá]z?)?","wordclass":"regex"},{"stem":"puszi"}],
10-
"thx" : [{"stem":"(ezer\s?)?(k[oö]s+z|k[oösz][oösz][oösz])(i(ke)?|ke|[oö]n[oö]m|[oö]nj[uü]k|[eoö]net(em)?|csi|ent+y[uüű])?(\s?sz[eé]pen)?","wordclass":"regex"},{"stem":"[ht][ht]x","wordclass":"regex"},{"stem":"t(ha|h?e)nks?\s?(you)?","wordclass":"regex"},{"stem":"danke"}],
10+
"thx" : [{"stem":"(ezer\s?)?(k[oö]s+z|k[oösz][oösz][oösz])(i(ke)?|ke|[oö]n[oö]m|[oö]nj[uü]k|[eoö]net(em)?|csi|ent+y[uüű])?(\s?sz[eé]pen)?","wordclass":"regex"},{"stem":"[ht][ht]x","wordclass":"regex"},{"stem":"t(ha|h?e)nks?\s?(you)?","wordclass":"regex"},{"stem":"danke"},{"stem":"neked is","max_words":3},{"stem":"magának is","max_words":3},{"stem":"önnek is","max_words":3}],
1111
"pls" : [{"stem":"p+l+[iíea]*[zs]+e*","wordclass":"regex"},{"stem":"l[eé]+[cgyt]+[sz]*[ií]+(ves|keh?)?","wordclass":"regex"},{"stem":"l[eé](sz(el)?|gy(en)?|n+[eé]l).*?(kedves|sz[ií](ves)?)","wordclass":"regex"},{"stem":"szeretn[eé](k|m)","wordclass":"regex","exc":[{"stem":"(meg)?bocs(i(ka)?|[aá](nat([aá][eé]rt)?|nat[aáo]t?|s+|s+on|j?t(ana)?))?","wordclass":"regex"},{"stem":"elnézés","wordclass":"noun","match_stem":False}]},{"stem":"(meg)?k[eé]r(het)?((n[eéi])?l?e?[km]?)","wordclass":"regex","exc":[{"stem":"(meg)?bocs(i(ka)?|[aá](nat([aá][eé]rt)?|nat[aáo]t?|s+|s+on|j?t(ana)?))?","wordclass":"regex"},{"stem":"elnézés","wordclass":"noun","match_stem":False}]},{"stem":"szeretn[eé]([km]|lek)","wordclass":"regex","exc":[{"stem":"(meg)?bocs(i(ka)?|[aá](nat([aá][eé]rt)?|nat[aáo]t?|s+|s+on|j?t(ana)?))?","wordclass":"regex"},{"stem":"elnézés","wordclass":"noun","match_stem":False}]}],
12-
"welks" : [{"stem":"nincs mit"},{"stem":"(nagyon\s?)?(is\s)?sz[ií]ves(en|\s?[oö]r[oö]mest)","wordclass":"regex"},{"stem":"ugyan\,?\shag[gy]\w{1,3}","wordclass":"regex"},{"stem":"hag[gy]\w{1,3}\scsak","wordclass":"regex"},{"stem":"sz[aá]momra.+?([oö]r[oö]m|megtiszteltet[eé]s)","wordclass":"regex"}],
12+
"welks" : [{"stem":"nincs mit"},{"stem":"(nagyon\s?)?(is\s)?sz[ií]ves(en|\s?[oö]r[oö]mest)","wordclass":"regex"},{"stem":"ugyan\,?\shag[gy]\w{1,3}","wordclass":"regex"},{"stem":"hag[gy]\w{1,3}\scsak","wordclass":"regex"},{"stem":"sz[aá]momra.+?([oö]r[oö]m|megtiszteltet[eé]s)","wordclass":"regex"},{"stem":"gyors volt"}],
1313
"sorry" : [{"stem":"(meg)?bocs(i(ka)?|esz|[aá](nat([aá][eé]rt)?|nat[aáo]t?|s+|s+on|j?t(ana)?))?","wordclass":"regex"},{"stem":"elnézés","wordclass":"noun","match_stem":False},{"stem":"sajn[aá]l(om|juk)","wordclass":"regex"},{"stem":"s+z*o*r+[iy]+(ka)?","wordclass":"regex"}],
1414
"lol" : [{"stem":"(h[aei]){2,}h?","wordclass":"regex"},{"stem":"o?(lol)+o?","wordclass":"regex"},{"stem":"[\:\;]\-*[dp\)9]+","wordclass":"regex","boundary":False},{"stem":"[\(8]+\-*[:;]","wordclass":"regex","boundary":False},{"stem":"rot?fl","wordclass":"regex"},{"stem":"vicces","exc":[{"stem":"nem"}]},{"stem":"nevet(tem|ek|[uü]nk)","wordclass":"regex","exc":[{"stem":"nem"}]}],
1515
"nvm" : [{"stem":"felejts[ed]n?\sel","wordclass":"regex"},{"stem":"mindegy","exc":[{"stem":"hogy"},{"stem":"nem"}]},{"stem":"nem fontos"},{"stem":"hagy(jad?|d)","wordclass":"regex","inc":[{"stem":"jól","affix":["van"]},{"stem":"á"},{"stem":"mindegy"},{"stem":"inkább"}]},{"stem":"ne\s(is\s)?(foglalkoz+(on|[aá]l)?|t[oö]r[oöő]dj([oö]n|[eé]l)?)\s(vel(e|[uü]k)|[ae][vz]+[ae]l)","wordclass":"regex"},{"stem":"hagy\w+\sfigyelmen\sk[ií]v[uü]l","wordclass":"regex","exc":[{"stem":"ne"}]}],
@@ -21,7 +21,7 @@ def common():
2121
"profanity" : [{"stem":"(fel|le|meg|r[aá]|ki|be|oda|[oö]s+ze|bele|hoz+[aá])?bas*z+d?(at)?(hat)?\s?(us|a[dk]?|n?[aá][kl]|[aá]?t[aáo][lkm]?|ot+|ni|n[aá]n?[dlkm]?|va|meg|ki)?","wordclass":"regex","exc":[{"stem":"megye"}]},{"stem":"fasz","prefix":["ló","agy"],"wordclass":"noun"},{"stem":"fasza","wordclass":"adjective"},{"stem":"geci","wordclass":"noun"},{"stem":"kurva","affix":["élet","anya","anyja","annya"],"wordclass":"noun"},{"stem":"hülye","wordclass":"adjective"},{"stem":"pi(n|cs)[aá][dk]?(a?t|nak|ban?|[bt][oó]l|[eé]rt)?","wordclass":"regex"},{"stem":"((bekap(ja?|hato?|n[aái])?d?)|(kap.*?be))","wordclass":"regex"},{"stem":"(le)?szop(sz|ol|[jn][aá][dl]|hat(sz|n[aá]l|o[dl]))(\s?(le|ki))?","wordclass":"regex"},{"stem":"(geci|kurva)?(fos|szar)\w{0,3}","wordclass":"regex"}],
2222
"welldone" : [{"stem":"fasza"},{"stem":"nagyszerű"},{"stem":"remek","max_words":5},{"stem":"jó","prefix":["kurva"],"exc":[{"stem":"nincs"},{"stem":"nem"},{"stem":"éjt"},{"stem":"reggelt"},{"stem":"napot"},{"stem":"estét"},{"stem":"éjszakát"}]},{"stem":"j[oó]l\s?van","wordclass":"regex"},{"stem":"király"},{"stem":"ügyes"},{"stem":"(sz[eé]p\s(volt|munka))|(ez\s(lesz\s)?az)|(sz?uper)|zs[ií]r","wordclass":"regex"},{"stem":"👍","wordclass":"emoji"},{"stem":"\(Y\)","wordclass":"regex","boundary":False},{"stem":"profi vagy"},{"stem":"fant[aoö](rp|sz?t)i[ck](us)?(an)?","wordclass":"regex"},{"stem":"szeretem","inc":[{"stem":"amikor"},{"stem":"ahogy"}],"exc":[{"stem":"nem"}]}],
2323
"dontknow" : [{"stem":"fogalmam sincs","affix":["en"]},{"stem":"(m[eé]g)?[ns]em?\stud(hat)?o\w+","wordclass":"regex"},{"stem":"hon+an.+?tud(jam|(hat)?n[aá]m)","wordclass":"regex"}],
24-
"dontunderstand": [{"stem":"(m[eé]g)?[ns]em?\s([eé]rte(t+e)?[lm](ek)?|v[aá]gom|hal+[ao](t+a)?[km])","wordclass":"regex"},{"stem":"(mit|hogy(an)?)\s([eé]rte(t+[eé])?|mond(t[aá])?o?)(sz|d|l)","wordclass":"regex"},{"stem":"meg\s?ism[eé]tel(het)?n\w+","wordclass":"regex"}],
24+
"dontunderstand": [{"stem":"(m[eé]g)?[ns]em?\s([eé]rte(t+e)?[lm](ek)?|v[aá]gom|hal+[ao](t+a)?[km])","wordclass":"regex"},{"stem":"(mit|hogy(an)?)\s([eé]rte(t+[eé])?|mond(t[aá])?o?)(sz|d|l)","wordclass":"regex"},{"stem":"meg\s?ism[eé]tel(het)?n\w+","wordclass":"regex"},{"stem":"ism[eé]tel[dj]\w*\s?meg","wordclass":"regex"}],
2525
}
2626

2727
# menu commands
@@ -117,7 +117,7 @@ def smalltalk():
117117
"about_name" : [{"stem":"(mond*(ja)?\ski|mi\sa)\s(bece)nev[eé][dt](et)?","wordclass":"regex"},{"stem":"(hogy(an)?|minek)\s(is\s)?(h[ií]v([jn][aá](la)?k|hatom)|nevez+(nek|elek))","wordclass":"regex"},{"stem":"(mi?[eé]rt\s|hogy[\s\-]?hogy\s)(let+\s)?(pont\s)?(ezt?\s(let+\s)?(a\s)?|[ií]gy\s|ilyen\s)(nevez[nt]ek|h[ií]v[nt]ak|neved|nevet\s(kapt[aá][dl]|adt[aá]k))","wordclass":"regex"},{"stem":"mi\sa\s(bece)?neved?","wordclass":"regex","exc":[{"stem":"az|[ae]n+[ae]k|amiben?|amelyik\w*","wordclass":"regex"}]}],
118118
"about_you" : [{"stem":"(mes[eé]lj|besz[eé]lj|mondj)([eo]n)?.+?mag(ad|[aá])r[oó]l","wordclass":"regex"},{"stem":"mutatkoz+([aá]l|on)?\s+be","wordclass":"regex"},{"stem":"(be)?muta(koz(hat)?n[aá]l|(tn[aá]d|sd)\s.+?magad(at)?)","wordclass":"regex"},{"stem":"([km]i(\s|\sa\s.+?)vagy te|te [km]i(\s|\sa\s.+?)vagy)","wordclass":"regex"}],
119119
"about_creator" : [{"stem":"(ki|hogy(an)?)\s(a\s)?(k[eé]sz([ií]t([oöő]d|et+(ek)?)|[uü]lt([eé]l)?)|gazd[aá]d|programoz([oó]d|ot+|tak)|[ií]rt[aá]k?|(hoz(ot+|tak)|j[oö]t+[eé]l).+?(l[eé]tre|vil[aá]gra|k[oó]dod(at)?)|alkot([oó][dt]+|tak)|teremt(et+|[oöő]d)|(keresztelt|nevezet+|adtak)\sel|adot+\s(neked\s)?nevet)","wordclass":"regex"}],
120-
"about_look" : [{"stem":"hogy(an)?\s(n[eé]zn?[eé]l\ski|mutatsz|festesz)","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+).+?(k[eé]pet|fot[oó]t|sz?elfie?t)\smagadr[oó]l","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+)\smagadr[oó]l.+?(k[eé]pet|fot[oó]t|sz?elfie?t)","wordclass":"regex"},{"stem":"(van|milyen)\s(az?\s)?(arcod|kin[eé]zeted)","wordclass":"regex"}],
120+
"about_look" : [{"stem":"hogy(an)?\s(n[eé]zn?[eé]l\ski|mutatsz|festesz)","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+).+?(k[eé]pet|fot[oó]t|sz?elfie?t)\smagadr[oó]l","wordclass":"regex"},{"stem":"(k[uü]ldj|mutas+)\smagadr[oó]l.+?(k[eé]pet|fot[oó]t|sz?elfie?t)","wordclass":"regex"},{"stem":"(van|milyen)\s(az?\s)?(arcod|kin[eé]zeted)","wordclass":"regex"},{"stem":"szép vagy"}],
121121
"about_age" : [{"stem":"mennyi idős vagy"},{"stem":"hány éves vagy"},{"stem":"melyik évben születtél"},{"stem":"mikor születtél"},{"stem":"(melyik\s[eé]vben|mikor)\sk[eé]sz([uü]lt[eé]l|[ií]tet+ek)","wordclass":"regex"},{"stem":"(h[aá]ny(adik|ban)|mikor\s(van|[uü]n+epled)\s?a?)\ssz[uü]l(et[eé]s|i)napod(at)?","wordclass":"regex"},{"stem":"h[aá]ny\s[eé]vesnek\s.+?\smagad(at)?","wordclass":"regex"},{"stem":"sz[uü]l(et[eé]s)?i?napod(at)?\s(h[aá]nyadik[aá]n|mikor|melyik)","wordclass":"regex"}],
122122
"about_zodiac" : [{"stem":"(neked\s)?mi\sa\s(horoszk[oó]pod|csil+agjegyed)","wordclass":"regex"},{"stem":"milyen jegyben születtél"},{"stem":"a\s(te\s)?(horoszk[oó]pod|csil+agjegyed)\smi(csoda)?","wordclass":"regex"},{"stem":"milyen\sjegyben\ssz[uü]let+\w+","wordclass":"regex"}],
123123
"about_location": [{"stem":"(hol|helyen)\s(k[eé]sz[uü]lt[eé]l|k[eé]sz[ií]tet+ek|sz[uü]let+[eé]l|(hoztak|j[oö]t+[eé]l).+?l[eé]tre)","wordclass":"regex"},{"stem":"hon+an\s(sz[aá]rmazol|[ií]rsz|val[oó]\svagy)","wordclass":"regex"},{"stem":"ho(n+an|l)\svagy\s(most\s)?(helyileg|most|pontosan)","wordclass":"regex"},{"stem":"(hol\s|mer+e\s)(laksz|(van|az?).+?ot+honod)","wordclass":"regex"}],
@@ -133,7 +133,8 @@ def smalltalk():
133133
"are_you_serious": [{"stem":"(nem?|csak)\s(vic+el(sz|j)?|mond+(od|ja)?|ideges[ií]ts(en)?)","wordclass":"regex"},{"stem":"(komolyan|t[eé]nyleg)\s?([uúií]gy\s|azt\s)?((mond|gondol|[ií]r)(ja|od|tad?)|hisz(i|ed)|hit+ed?)","wordclass":"regex"},{"stem":"biztos(an)?\s(vagy\s)?(\w+\s)?(ben+e|eb+en|mond(ta|o)d|mond[jt]a)","wordclass":"regex"},{"stem":"ezt?\s(most\s)?komoly(an)?","wordclass":"regex"}],
134134
"can_you_hear_me": [{"stem":"(olvas+a|hal+ja|n[eé]zi|van\sit+)(\sezt)?\s(vala|b[aá]r)ki(\sis)?","wordclass":"regex"},{"stem":"(hal+(asz|od)|l[aá]t(sz|od)|vesze[ld])\s(engem|a?mit\s(mondok|[ií]rok|k[eé]rdezek))","wordclass":"regex"},{"stem":"valaki\s(hal+(ja)?\s|olvas+a|figyeli?(\sar+a)?)\sa?mit\s(ide\s?|it+\s)?([ií]rok|mondok|k[eé]rdezek)","wordclass":"regex"},{"stem":"felfogtad","max_words":3}],
135135
"can_you_learn": [{"stem":"(k[eé]pes(\svagy)?|tud(sz)?)\stanulni","wordclass":"regex"},{"stem":"tanulsz\s(is|[ae].+?b[oóöő]l)","wordclass":"regex"},{"stem":"[dln][aáeéo][km]\s(be|meg)?tan[ií]tani\b","wordclass":"regex","boundary":False}],
136-
"can_you_understand_me":[{"stem":"(meg)?[eé]rt(e(d|sz|t+ed?)|i)\,?((\shogy)?\sa?mit\s([ií]r|mond)\w+|\smagyarul)","wordclass":"regex"}]
136+
"can_you_understand_me":[{"stem":"(meg)?[eé]rt(e(d|sz|t+ed?)|i)\,?((\shogy)?\sa?mit\s([ií]r|mond)\w+|\smagyarul)","wordclass":"regex"}],
137+
"contact" : [{"stem":"mi(lyen)?\s(.+?\s)?(e\-?mail\s?)?c[ií]me[dn]?","wordclass":"regex"},{"stem":"elérhetőség","wordclass":"noun"},{"stem":"elér","wordclass":"verb","inc":[{"stem":"önt"},{"stem":"téged"}]}]
137138
}
138139

139140
# smiley and emoji references

lara/parser.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,11 @@ def _generate(self, item):
9999
item['wordclass'] = item['wordclass'].lower()
100100
else:
101101
item['wordclass'] = 'special'
102+
if item['wordclass'] == 'regex':
103+
item['stem'] = item['stem'].replace("\b","\\b")
104+
if 'typo_stem' in item:
105+
item['typo_stem'] = item['typo_stem'].replace("\b","\\b")
106+
102107
if 'typo_stem' not in item:
103108
if item['wordclass'] in ('regex','emoji'):
104109
item['typo_stem'] = item['stem']

0 commit comments

Comments
 (0)