Skip to content

Commit 6f37fce

Browse files
committed
- added relative_dates() to parser Extract()
- added relative_dates() function to parser Extract() to convert days to date format - updated times() function in parser Extract() - updated _convert_numbers() function in parser Extract() - updated test cases and added new test cases for parser Extract() relative_dates() - updated entities based on user feedback - increased version to 1.0.3
1 parent b8e601d commit 6f37fce

File tree

4 files changed

+132
-70
lines changed

4 files changed

+132
-70
lines changed

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','tippmix','entities'
6-
__version__ = '1.0.2'
6+
__version__ = '1.0.3'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/entities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ def common():
1818
"command" : [{"stem":"(csin[aá]l(jad?|d)|(keres|mutas|mond)[aedjos]+n?|n[eé]z[nz]?[eé]?[dl]|akaro[km]|utas[ií]t\w{1,})","wordclass":"regex"},{"stem":"haj[cts]+(a|[aá]?[ld])\sv[eé]gre","wordclass":"regex"}],
1919
"question" : [{"stem":"(\?+$)|(\?+\s\w+)","wordclass":"regex"},{"stem":"([^,][^,\S+]hogy|^hogy)(an)?","wordclass":"regex"},{"stem":"hol"},{"stem":"honnan"},{"stem":"hová"},{"stem":"hány","affix":["an","at","ból"]},{"stem":"mettől"},{"stem":"meddig"},{"stem":"merre"},{"stem":"mennyi","affix":["en","re"]},{"stem":"mi","affix":["t","k","ket","kor","korra","lyen","lyenek","nek","től","kortól","korra","ből","hez","re","vel"]},{"stem":"ki(k?(e?t|nek|[bt][oöő]l|hez|re|[kv]el)|\saz?)","wordclass":"regex"}],
2020
"conditional" : [{"stem":"(meg)?(vol|tud|[lt]en?)n[aáeé][dl]?","wordclass":"regex"},{"stem":"\w+h[ae]t\w+","wordclass":"regex"}],
21-
"profanity" : [{"stem":"(fel|le|meg|r[aá]|ki|be|oda|[oö]s+ze|bele|hoz+[aá])?bas*z+d?\s?(at)?(hat)?(us|a[dk]?|n?[aá][kl]|[aá]?t[aáo][lkm]?|ot+|ni|n[aá]n?[dlkm]?|va|meg)?","wordclass":"regex","exc":[{"stem":"megye"}]},{"stem":"fasz","prefix":["ló","agy"],"wordclass":"noun"},{"stem":"fasza","wordclass":"adjective"},{"stem":"geci","wordclass":"noun"},{"stem":"kurva","affix":["élet","anya","anyja","annya"],"wordclass":"noun"},{"stem":"hülye","wordclass":"adjective"},{"stem":"pi(n|cs)[aá][dk]?(a?t|nak|ban?|[bt][oó]l|[eé]rt)?","wordclass":"regex"},{"stem":"((bekap(ja?|hato?|n[aái])?d?)|(kap.*?be))","wordclass":"regex"},{"stem":"(le)?szop(sz|ol|[jn][aá][dl]|hat(sz|n[aá]l|o[dl]))(\s?(le|ki))?","wordclass":"regex"},{"stem":"(geci|kurva)?(fos|szar)\w{0,3}","wordclass":"regex"}],
21+
"profanity" : [{"stem":"(fel|le|meg|r[aá]|ki|be|oda|[oö]s+ze|bele|hoz+[aá])?bas*z+d?(at)?(hat)?\s?(us|a[dk]?|n?[aá][kl]|[aá]?t[aáo][lkm]?|ot+|ni|n[aá]n?[dlkm]?|va|meg|ki)?","wordclass":"regex","exc":[{"stem":"megye"}]},{"stem":"fasz","prefix":["ló","agy"],"wordclass":"noun"},{"stem":"fasza","wordclass":"adjective"},{"stem":"geci","wordclass":"noun"},{"stem":"kurva","affix":["élet","anya","anyja","annya"],"wordclass":"noun"},{"stem":"hülye","wordclass":"adjective"},{"stem":"pi(n|cs)[aá][dk]?(a?t|nak|ban?|[bt][oó]l|[eé]rt)?","wordclass":"regex"},{"stem":"((bekap(ja?|hato?|n[aái])?d?)|(kap.*?be))","wordclass":"regex"},{"stem":"(le)?szop(sz|ol|[jn][aá][dl]|hat(sz|n[aá]l|o[dl]))(\s?(le|ki))?","wordclass":"regex"},{"stem":"(geci|kurva)?(fos|szar)\w{0,3}","wordclass":"regex"}],
2222
"welldone" : [{"stem":"fasza"},{"stem":"nagyszerű"},{"stem":"remek","max_words":5},{"stem":"jó","prefix":["kurva"],"exc":[{"stem":"nincs"},{"stem":"nem"},{"stem":"éjt"},{"stem":"reggelt"},{"stem":"napot"},{"stem":"estét"},{"stem":"éjszakát"}]},{"stem":"j[oó]l\s?van","wordclass":"regex"},{"stem":"király"},{"stem":"ügyes"},{"stem":"(sz[eé]p\s(volt|munka))|(ez\s(lesz\s)?az)|(sz?uper)|zs[ií]r","wordclass":"regex"},{"stem":"👍","wordclass":"emoji"},{"stem":"\(Y\)","wordclass":"regex","boundary":False},{"stem":"profi vagy"},{"stem":"fant[aoö](rp|sz?t)i[ck](us)?(an)?","wordclass":"regex"},{"stem":"szeretem","inc":[{"stem":"amikor"},{"stem":"ahogy"}],"exc":[{"stem":"nem"}]}],
2323
"dontknow" : [{"stem":"fogalmam sincs","affix":["en"]},{"stem":"(m[eé]g)?[ns]em?\stud(hat)?o\w+","wordclass":"regex"},{"stem":"hon+an.+?tud(jam|(hat)?n[aá]m)","wordclass":"regex"}],
2424
"dontunderstand": [{"stem":"(m[eé]g)?[ns]em?\s([eé]rte(t+e)?[lm](ek)?|v[aá]gom|hal+[ao](t+a)?[km])","wordclass":"regex"},{"stem":"(mit|hogy(an))\s([eé]rte(t+[eé])?|mond(t[aá])?o?)(sz|d|l)","wordclass":"regex"},{"stem":"meg\s?ism[eé]tel(het)?n\w+","wordclass":"regex"}],

lara/parser.py

Lines changed: 74 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -679,22 +679,23 @@ def dates(self,normalize=True,convert=True):
679679
return results
680680

681681
# extract times like 12:00 or délután 4
682-
def times(self,normalize=True,convert=True,current=-1):
682+
def times(self,normalize=True,convert=True,current=False):
683683
if self.text:
684-
matches = _re.findall(r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?:reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?[012]?\d\s?(?:\:\s?|\-?kor\s?|[oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?:kor|perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?:reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?)', re.IGNORECASE, self._ntext_ if convert else self._text_)
684+
matches = _re.findall(r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?[012]?\d\s?(?:\:\s?|\-?kor\s?|\-?t[oóöő]l|\-?ig|\-?r[ae]|[oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?:kor|t[oóöő]l|ig|r[ae]|perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?(1)(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?))?)', re.IGNORECASE, self._ntext_ if convert else self._text_)
685685
if normalize:
686686
results = []
687-
for item in matches:
687+
for _item in matches:
688+
item = _item[0]
688689
if len(item.strip())>2:
689-
item = ' '+item+' '
690-
hour = "00"
690+
item = ' '+item.lower()+' '
691+
hour = "00"
691692
minute = "00"
692693
pm = False
693-
zero = False
694-
elott = False
695-
hour_matches = _re.findall(r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:|\-?kor|[oó]r[aá])?', re.IGNORECASE, item)
696-
minute_matches = _re.findall(r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]ra)(?:\-?kor|perc)?', re.IGNORECASE, item)
697-
quarter_matches = _re.findall(r'((?:h[aá]rom)?negyed|f[eé]l)', re.IGNORECASE, item)
694+
zero = False
695+
elott = False
696+
hour_matches = _re.findall(r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:|\-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]|[oó]r[aá]\w*)?', re.IGNORECASE, item)
697+
minute_matches = _re.findall(r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]r[aá])(?:\-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]|perc\w*)?', re.IGNORECASE, item)
698+
quarter_matches= _re.findall(r'((?:h[aá]rom)?negyed|f[eé]l)', re.IGNORECASE, item)
698699
am_matches = _re.findall(r'(reggel|hajnal|d[eé]lel[oöő]t|d\.?e\.?)', re.IGNORECASE, item)
699700
pm_matches = _re.findall(r'(d[eé]lut[aá]n|d\.?u\.?|este|[eé]j+el)', re.IGNORECASE, item)
700701
if len(hour_matches) in (1,2):
@@ -763,15 +764,14 @@ def times(self,normalize=True,convert=True,current=-1):
763764
if pm_matches:
764765
pm = True
765766
elif not am_matches:
766-
if current:
767-
if current>=0:
768-
now = current
769-
else:
770-
now = datetime.datetime.now().hour
771-
if 'holnap' in item and hour<9:
772-
pm = True
773-
elif hour<12 and now>hour:
774-
pm = True
767+
if current is not False:
768+
now = current
769+
else:
770+
now = datetime.datetime.now().hour
771+
if 'holnap' in item and hour<9:
772+
pm = True
773+
elif hour<12 and now>hour:
774+
pm = True
775775
if pm and hour<=12:
776776
hour += 12
777777
hour %= 24
@@ -780,7 +780,7 @@ def times(self,normalize=True,convert=True,current=-1):
780780
results.append(str(hour).zfill(2)+':'+str(minute).zfill(2))
781781
return results
782782
else:
783-
return [item.strip() for item in matches if len(item.strip())>2]
783+
return [item[0].strip() for item in matches if len(item[0].strip())>2]
784784
return []
785785

786786
# extract list of time durations
@@ -795,13 +795,13 @@ def durations(self,normalize=True,convert=True):
795795
val = 0
796796
for sub_item in sub_matches:
797797
match = sub_item.lower().replace(',','.')
798-
sval = ''
798+
sval = ''
799799
for char in match:
800800
if char.isdigit() or char=='.':
801801
sval+=char
802802
else:
803803
break
804-
sval = float(sval)
804+
sval = float(sval)
805805
mpx = 1
806806
if 'tized' in match or 'tízed' in match:
807807
mpx = 0.1
@@ -953,11 +953,57 @@ def emails(self):
953953
return _re.findall(r'\b([\w\d\-\_\.]+\@[\w\d\-\_\.]+\.\w{2,4}(?:\.\w{2,4})?)\b', re.IGNORECASE, self.text)
954954
return []
955955

956+
# extract relative dates like tomorrow or wednesday
957+
def relative_dates(self,normalize=True,current=False):
958+
if self.text:
959+
matches = _re.findall(r'\b((?:(?:meg)?el[oöő]z[oöő]|m[uú]lt|(?:r[aá])?k[oö]vetkez[oöő]|j[oö]v[oöő])?\s?(?:h[eé]t(?:i|en)?\s?)?(?:tegnap(?:el[oöő]t+)?|holnap(?:ut[aá]n)?|m[aá](?:i nap)?|h[eé]tf[oöő]|ked+|szerd[aá]|cs[uü]t[oö]rt[oö]k|p[eé]ntek|szo[nm]bat|vas[aá]rnap))(?:[aáeoö][dm])?(?:ig|r[ae]|t[oóöő]l|[aáeoöő]?t|[dkmnptv][ae][lk]|[aáeoö]?n)?\b', re.IGNORECASE, self.text)
960+
if normalize:
961+
if current is not False:
962+
_now = datetime.datetime.strptime(current,"%Y-%m-%d")
963+
else:
964+
_now = datetime.datetime.now()
965+
results = []
966+
for item in matches:
967+
item = item.lower()
968+
now = _now
969+
if 'holnap' in item:
970+
if 'ut' in item:
971+
now += datetime.timedelta(days = 2)
972+
else:
973+
now += datetime.timedelta(days = 1)
974+
elif 'tegnap' in item:
975+
if 'el' in item:
976+
now += datetime.timedelta(days = -2)
977+
else:
978+
now += datetime.timedelta(days = -1)
979+
elif 'ma' not in item and 'má' not in item:
980+
now -= datetime.timedelta(days = now.weekday())
981+
if _re.findall(r'((?:meg)?el[oöő]z[oöő]|m[uú]lt)', re.IGNORECASE, item):
982+
now -= datetime.timedelta(weeks = 1)
983+
elif _re.findall(r'((?:r[aá])?k[oö]vetkez[oöő]|j[oö]v[oöő])', re.IGNORECASE, item):
984+
now += datetime.timedelta(weeks = 1)
985+
if 'ked' in item:
986+
now += datetime.timedelta(days = 1)
987+
elif 'szerd' in item:
988+
now += datetime.timedelta(days = 2)
989+
elif _re.findall(r'cs[uü]t[oö]rt[oö]k', re.IGNORECASE, item):
990+
now += datetime.timedelta(days = 3)
991+
elif _re.findall(r'p[eé]ntek', re.IGNORECASE, item):
992+
now += datetime.timedelta(days = 4)
993+
elif _re.findall(r'szo[mn]bat', re.IGNORECASE, item):
994+
now += datetime.timedelta(days = 5)
995+
elif _re.findall(r'vas[aá]rnap', re.IGNORECASE, item):
996+
now += datetime.timedelta(days = 6)
997+
results.append(now.strftime('%Y-%m-%d'))
998+
return results
999+
else:
1000+
return [item.strip() for item in matches]
1001+
return []
1002+
9561003
# Converts text representation of numbers to digits
9571004
def _convert_numbers(self,text):
9581005
if text:
959-
#fix = _re.sub(r'(?<=\d)\s+(?=\d)',re.IGNORECASE,'',text.lower())
960-
matches = _re.findall(r'((?:m[ií]n[uú]sz\s?|negat[ií]v\s?)?(?:(?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|egy|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]gy|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?\W*)|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó])\W*)+(?:[aeoö]dik)?(?:j?[aáeéi]+n?)?)\b', re.IGNORECASE, text.lower())
1006+
matches = _re.findall(r'((?:m[ií]n[uú]sz\s?|negat[ií]v\s?)?(?:(?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|eg+y|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]g+y|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?\W*)|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó])\W*)+(?:[aeoö]dik)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?)\b', re.IGNORECASE, text)
9611007
results = {}
9621008
for match in matches:
9631009
value = 0
@@ -968,7 +1014,7 @@ def _convert_numbers(self,text):
9681014
else:
9691015
minus = 1
9701016
minusm = ''
971-
parts = _re.findall(r'((?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|egy|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]gy|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó]))\W*)', re.IGNORECASE, match)
1017+
parts = _re.findall(r'((?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|eg+y|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]g+y|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó]))\W*)', re.IGNORECASE, match)
9721018
values = []
9731019
for part in parts:
9741020
val = 0
@@ -1045,7 +1091,7 @@ def _convert_numbers(self,text):
10451091

10461092
swap = sorted(results.items(), key=lambda x: x[1], reverse=True)
10471093
for item in swap:
1048-
text = _re.sub(r'\b('+re.escape(item[0])+r')(?:[aeoö]dik?)?(?:j?[aáeéi]+n?)?\b', re.IGNORECASE, str(item[1]), text)
1094+
text = _re.sub(r'\b('+re.escape(item[0])+r')(?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?\b', re.IGNORECASE, str(item[1]), text)
10491095
return text
10501096
return ''
10511097

@@ -1063,9 +1109,9 @@ def _convert_numbers_helper(self,match,default):
10631109
return 2
10641110
elif _re.findall(r'(harmadik|h[aá]rom)', re.IGNORECASE, match):
10651111
return 3
1066-
elif _re.findall(r'n[eé]gy', re.IGNORECASE, match):
1112+
elif _re.findall(r'n[eé]g+y', re.IGNORECASE, match):
10671113
return 4
1068-
elif 'egy' in match or 'els' in match:
1114+
elif 'egy' in match or 'els' in match or 'eggy' in match:
10691115
return 1
10701116
elif _re.findall(r'[oö]t', re.IGNORECASE, match):
10711117
return 5

0 commit comments

Comments
 (0)