@@ -681,9 +681,9 @@ def dates(self,normalize=True,convert=True):
681
681
# extract times like 12:00 or délután 4
682
682
def times (self ,normalize = True ,convert = True ,current = False ):
683
683
if self .text :
684
- matches = _re .findall (r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?(?:[012]?\d|d[eé]l\w*|[eé]jf[eé]l\w*)\s?(?:\:\s?|\-?kor\s?|\-?t[oóöő]l|\-?ig|\-?r[ae]|[oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?:kor|t[oóöő]l|ig|r[ae]|perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?(1)(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?))?)' , re .IGNORECASE , self ._ntext_ if convert else self ._text_ )
684
+ matches = _re .findall (r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?(?:[012]?\d|d[eé]l\w*|[eé]jf[eé]l\w*)\s?(?:\:\s?|k[oö]z[oö]t+|\-?kor\s?|\-?t[oóöő]l|\-?ig?|\-?r[ae]|[oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?![cmntvz][ae]l)(?:kor|t[oóöő]l|ig?|r[ae]|perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?(1)(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?))?)' , re .IGNORECASE , self ._ntext_ if convert else self ._text_ )
685
+ results = []
685
686
if normalize :
686
- results = []
687
687
last_pm = None
688
688
for _item in matches :
689
689
item = _item [0 ]
@@ -695,8 +695,8 @@ def times(self,normalize=True,convert=True,current=False):
695
695
zero = False
696
696
elott = False
697
697
del_matches = _re .findall (r'd[eé]l\w*|[eé]jf[eé]l\w*' , re .IGNORECASE , item )
698
- hour_matches = _re .findall (r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:| \-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]|[oó]r[aá]\w*)?' , re .IGNORECASE , item )
699
- minute_matches = _re .findall (r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]r[aá])(?:\-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]|perc\w*)?' , re .IGNORECASE , item )
698
+ hour_matches = _re .findall (r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:\s?|k[oö]z[oö]t+| \-?kor|\-?t[oóöő]l|\-?ig? |\-?r[ae]|[oó]r[aá]\w*)?' , re .IGNORECASE , item )
699
+ minute_matches = _re .findall (r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]r[aá])(?:\-?kor|\-?t[oóöő]l|\-?ig? |\-?r[ae]|perc\w*)?' , re .IGNORECASE , item )
700
700
quarter_matches = _re .findall (r'((?:h[aá]rom)?negyed|f[eé]l)' , re .IGNORECASE , item )
701
701
am_matches = _re .findall (r'(reggel|hajnal|d[eé]lel[oöő]t|d\.?e\.?)' , re .IGNORECASE , item )
702
702
pm_matches = _re .findall (r'(d[eé]lut[aá]n|d\.?u\.?|este|[eé]j+el)' , re .IGNORECASE , item )
@@ -787,15 +787,22 @@ def times(self,normalize=True,convert=True,current=False):
787
787
results .append ('00:00' )
788
788
else :
789
789
results .append ('12:00' )
790
- return results
791
790
else :
792
- return [item [0 ].strip () for item in matches if len (item [0 ].strip ())> 4 ]
791
+ for item in matches :
792
+ item = item [0 ].strip ()
793
+ ok = False
794
+ for char in item :
795
+ if not char .isnumeric ():
796
+ ok = True
797
+ if item and ok :
798
+ results .append (item )
799
+ return results
793
800
return []
794
801
795
802
# extract list of time durations
796
803
def durations (self ,normalize = True ,convert = True ):
797
804
if self .text :
798
- matches = _re .findall (r'\b((?:(?:(?:\d\s?)+(?:[\.\,]\d+)?\s(?:(?:[eé]s\s)?(?:f[eé]l|(?:h[aá]rom)?negyed)\s)?(?:(?:(?:t[ií]zed|sz[aá]zad|ezred)?m[aá]sod)?perc\w{0,3}|[oó]r[aá]\w{0,3}|nap\w{0,3}|7|h[eé]t\w{0,3}|h[oó]nap\w{0,3}|[eé]v\w{0,3})(?:\s(?:m[uú]lva|r[aá]|(?:ez)?el[oöő]t+|el[oöő]b+|k[eé]s[oö]b+|bel[uü]l|h[aá]tr(?:a|[eé]bb)|vissza|el[oöő]re))?)(?:\W{1,2}(?:[eé]s|meg)?\W*)?)+)' , re .IGNORECASE , self .ntext if convert else self .text )
805
+ matches = _re .findall (r'\b((?:(?:(?:\d\s?)+(?:[\.\,]\d+)?\s(?:(?:[eé]s\s)?(?:f[eé]l|(?:h[aá]rom)?negyed)\s)?(?:(?:(?:t[ií]zed|sz[aá]zad|ezred)?m[aá]sod)?perc\w{0,3}|[oó]r[aá]\w{0,3}|nap\w{0,3}|7\w{0,3} |h[eé]t\w{0,3}|h[oó]nap\w{0,3}|[eé]v\w{0,3})(?:\s(?:m[uú]lva|r[aá]|(?:ez)?el[oöő]t+|el[oöő]b+|k[eé]s[oö]b+|bel[uü]l|h[aá]tr(?:a|[eé]bb)|vissza|el[oöő]re))?)(?:\W{1,2}(?:[eé]s|meg)?\W*)?)+)' , re .IGNORECASE , self .ntext if convert else self .text )
799
806
if normalize :
800
807
results = []
801
808
now = datetime .datetime .now ()
@@ -1027,13 +1034,13 @@ def timestamps(self,current=False):
1027
1034
relative_pos = []
1028
1035
times_pos = []
1029
1036
for item in dates :
1030
- for match in _re .finditer (r'\b' + re .escape (item ), re .IGNORECASE , self .text ):
1037
+ for match in _re .finditer (r'\b' + re .escape (item ), re .IGNORECASE , self .ntext ):
1031
1038
dates_pos .append (match .span ()[0 ])
1032
1039
for item in relative :
1033
- for match in _re .finditer (r'\b' + re .escape (item ), re .IGNORECASE , self .text ):
1040
+ for match in _re .finditer (r'\b' + re .escape (item ), re .IGNORECASE , self .ntext ):
1034
1041
relative_pos .append (match .span ()[0 ])
1035
1042
for item in times :
1036
- for match in _re .finditer (r'\b' + re .escape (item ), re .IGNORECASE , self .text ):
1043
+ for match in _re .finditer (r'\b' + re .escape (item ), re .IGNORECASE , self .ntext ):
1037
1044
times_pos .append (match .span ()[0 ])
1038
1045
dates_pos .append (- 1 )
1039
1046
relative_pos .append (- 1 )
@@ -1168,7 +1175,7 @@ def _convert_numbers(self,text):
1168
1175
1169
1176
swap = sorted (results .items (), key = lambda x : x [1 ], reverse = True )
1170
1177
for item in swap :
1171
- text = _re .sub (r'\b(' + re .escape (item [0 ])+ r')(?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?\b' , re .IGNORECASE , str (item [1 ]), text )
1178
+ text = _re .sub (r'\b(' + re .escape (item [0 ])+ r')(( ?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?)? \b' , re .IGNORECASE , re . escape ( str (item [1 ])) + r'\2' , text )
1172
1179
return text
1173
1180
return ''
1174
1181
0 commit comments