Skip to content

Commit dac36f4

Browse files
committed
- fixed minor bugs in 1.0.4
- updated parser Extract() _convert_numbers() to let endings in numbers remain
- updated parser Extract() times()
- updated parser Extract() durations()
- updated parser Extract() timestamps()
- updated test cases for parser to match results from updated _convert_numbers() function
- increased version to 1.0.4
1 parent 7c78976 commit dac36f4

File tree

3 files changed

+21
-14
lines changed

3 files changed

+21
-14
lines changed

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','tippmix','entities'
6-
__version__ = '1.0.3'
6+
__version__ = '1.0.4'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/parser.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -681,9 +681,9 @@ def dates(self,normalize=True,convert=True):
681681
# extract times like 12:00 or délután 4
682682
def times(self,normalize=True,convert=True,current=False):
683683
if self.text:
684-
matches = _re.findall(r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?(?:[012]?\d|d[eé]l\w*|[eé]jf[eé]l\w*)\s?(?:\:\s?|\-?kor\s?|\-?t[oóöő]l|\-?ig|\-?r[ae]|[oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?:kor|t[oóöő]l|ig|r[ae]|perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?(1)(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?))?)', re.IGNORECASE, self._ntext_ if convert else self._text_)
684+
matches = _re.findall(r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?(?:[012]?\d|d[eé]l\w*|[eé]jf[eé]l\w*)\s?(?:\:\s?|k[oö]z[oö]t+|\-?kor\s?|\-?t[oóöő]l|\-?ig?|\-?r[ae]|[oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?![cmntvz][ae]l)(?:kor|t[oóöő]l|ig?|r[ae]|perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?(1)(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?))?)', re.IGNORECASE, self._ntext_ if convert else self._text_)
685+
results = []
685686
if normalize:
686-
results = []
687687
last_pm = None
688688
for _item in matches:
689689
item = _item[0]
@@ -695,8 +695,8 @@ def times(self,normalize=True,convert=True,current=False):
695695
zero = False
696696
elott = False
697697
del_matches = _re.findall(r'd[eé]l\w*|[eé]jf[eé]l\w*', re.IGNORECASE, item)
698-
hour_matches = _re.findall(r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:|\-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]|[oó]r[aá]\w*)?', re.IGNORECASE, item)
699-
minute_matches = _re.findall(r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]r[aá])(?:\-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]|perc\w*)?', re.IGNORECASE, item)
698+
hour_matches = _re.findall(r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:\s?|k[oö]z[oö]t+|\-?kor|\-?t[oóöő]l|\-?ig?|\-?r[ae]|[oó]r[aá]\w*)?', re.IGNORECASE, item)
699+
minute_matches = _re.findall(r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]r[aá])(?:\-?kor|\-?t[oóöő]l|\-?ig?|\-?r[ae]|perc\w*)?', re.IGNORECASE, item)
700700
quarter_matches= _re.findall(r'((?:h[aá]rom)?negyed|f[eé]l)', re.IGNORECASE, item)
701701
am_matches = _re.findall(r'(reggel|hajnal|d[eé]lel[oöő]t|d\.?e\.?)', re.IGNORECASE, item)
702702
pm_matches = _re.findall(r'(d[eé]lut[aá]n|d\.?u\.?|este|[eé]j+el)', re.IGNORECASE, item)
@@ -787,15 +787,22 @@ def times(self,normalize=True,convert=True,current=False):
787787
results.append('00:00')
788788
else:
789789
results.append('12:00')
790-
return results
791790
else:
792-
return [item[0].strip() for item in matches if len(item[0].strip())>4]
791+
for item in matches:
792+
item = item[0].strip()
793+
ok = False
794+
for char in item:
795+
if not char.isnumeric():
796+
ok = True
797+
if item and ok:
798+
results.append(item)
799+
return results
793800
return []
794801

795802
# extract list of time durations
796803
def durations(self,normalize=True,convert=True):
797804
if self.text:
798-
matches = _re.findall(r'\b((?:(?:(?:\d\s?)+(?:[\.\,]\d+)?\s(?:(?:[eé]s\s)?(?:f[eé]l|(?:h[aá]rom)?negyed)\s)?(?:(?:(?:t[ií]zed|sz[aá]zad|ezred)?m[aá]sod)?perc\w{0,3}|[oó]r[aá]\w{0,3}|nap\w{0,3}|7|h[eé]t\w{0,3}|h[oó]nap\w{0,3}|[eé]v\w{0,3})(?:\s(?:m[uú]lva|r[aá]|(?:ez)?el[oöő]t+|el[oöő]b+|k[eé]s[oö]b+|bel[uü]l|h[aá]tr(?:a|[eé]bb)|vissza|el[oöő]re))?)(?:\W{1,2}(?:[eé]s|meg)?\W*)?)+)', re.IGNORECASE, self.ntext if convert else self.text)
805+
matches = _re.findall(r'\b((?:(?:(?:\d\s?)+(?:[\.\,]\d+)?\s(?:(?:[eé]s\s)?(?:f[eé]l|(?:h[aá]rom)?negyed)\s)?(?:(?:(?:t[ií]zed|sz[aá]zad|ezred)?m[aá]sod)?perc\w{0,3}|[oó]r[aá]\w{0,3}|nap\w{0,3}|7\w{0,3}|h[eé]t\w{0,3}|h[oó]nap\w{0,3}|[eé]v\w{0,3})(?:\s(?:m[uú]lva|r[aá]|(?:ez)?el[oöő]t+|el[oöő]b+|k[eé]s[oö]b+|bel[uü]l|h[aá]tr(?:a|[eé]bb)|vissza|el[oöő]re))?)(?:\W{1,2}(?:[eé]s|meg)?\W*)?)+)', re.IGNORECASE, self.ntext if convert else self.text)
799806
if normalize:
800807
results = []
801808
now = datetime.datetime.now()
@@ -1027,13 +1034,13 @@ def timestamps(self,current=False):
10271034
relative_pos= []
10281035
times_pos = []
10291036
for item in dates:
1030-
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.text):
1037+
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
10311038
dates_pos.append(match.span()[0])
10321039
for item in relative:
1033-
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.text):
1040+
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
10341041
relative_pos.append(match.span()[0])
10351042
for item in times:
1036-
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.text):
1043+
for match in _re.finditer(r'\b'+re.escape(item), re.IGNORECASE, self.ntext):
10371044
times_pos.append(match.span()[0])
10381045
dates_pos.append(-1)
10391046
relative_pos.append(-1)
@@ -1168,7 +1175,7 @@ def _convert_numbers(self,text):
11681175

11691176
swap = sorted(results.items(), key=lambda x: x[1], reverse=True)
11701177
for item in swap:
1171-
text = _re.sub(r'\b('+re.escape(item[0])+r')(?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?\b', re.IGNORECASE, str(item[1]), text)
1178+
text = _re.sub(r'\b('+re.escape(item[0])+r')((?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?)?\b', re.IGNORECASE, re.escape(str(item[1]))+r'\2', text)
11721179
return text
11731180
return ''
11741181

tests/test_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ def test_parser_extract(info):
402402
(
403403
{
404404
"in" : "harmincnégy lol első a második harmadik :D negyed végén ötödikén mit más csinálsz tízenkétmillióhatvanezerhetvenegy és hárommillió száz huszonkettő vagy még nullamilliárd de akkor már kettő kettő tizenkettő :) harmincnégy és nyolcvan illetve kilencvenezer az állás pedig egy-egy és végül egy kettő három",
405-
"out" : "34 lol 1 a 2 3 :D negyed végén 5 mit más csinálsz 12060071 és 3000122 vagy még 0 de akkor már 2212 :) 34 és 80 illetve 90000 az állás pedig 1-1 és végül 1 2 3"
405+
"out" : "34 lol 1 a 2 3 :D negyed végén 5ödikén mit más csinálsz 12060071 és 3000122 vagy még 0 de akkor már 2212 :) 34 és 80 illetve 90000 az állás pedig 1-1 és végül 1 2 3"
406406
}
407407
),
408408
(
@@ -614,7 +614,7 @@ def test_parser_extract_convert_numbers(info):
614614
"text" : "3 óra és 4 perc múlva valamint majd egyszer egy héttel rá",
615615
"function" : "durations",
616616
"args" : [False],
617-
"result" : ['3 óra és 4 perc múlva', '1 7 rá']
617+
"result" : ['3 óra és 4 perc múlva', '1 7tel rá']
618618
}
619619
),
620620
(

0 commit comments

Comments
 (0)