@@ -679,22 +679,23 @@ def dates(self,normalize=True,convert=True):
679
679
return results
680
680
681
681
# extract times like 12:00 or délután 4
682
- def times (self ,normalize = True ,convert = True ,current = - 1 ):
682
+ def times (self ,normalize = True ,convert = True ,current = False ):
683
683
if self .text :
684
- matches = _re .findall (r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?: reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?[012]?\d\s?(?:\:\s?|\-?kor\s?|[ oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?:kor|perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?: reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?)' , re .IGNORECASE , self ._ntext_ if convert else self ._text_ )
684
+ matches = _re .findall (r'((?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?)?\,?\s?(?:[12345]?\d\s?perc+el\s)?(?:(?:h[aá]rom)?negyed\s?|f[eé]l\s?)?[012]?\d\s?(?:\:\s?|\-?kor\s?|\-?t[oóöő]l|\-?ig|\-?r[ae]|[ oó]r[aá]\w{0,3}\s?)?(?:el[oöő]t+\s?|ut[aá]n\s?)?(?:[0123456]?\d[\-\s]?(?:kor|t[oóöő]l|ig|r[ae]| perc\w{0,3})?)?\,?\s?(?:ma\s?|holnap(?:\s?ut[aá]n)?\s?|tegnap(?:\s?el[oöő]t+)?\s?)?(?(1)( reggel\s?|hajnal(?:i|ban)?\s?|d[eé]lel[oöő]t+\s?|d\.?e\.?\s?|d[eé]lut[aá]n\s?|d\.?u\.?\s?|este\s?|[eé]j+el\s?) )?)' , re .IGNORECASE , self ._ntext_ if convert else self ._text_ )
685
685
if normalize :
686
686
results = []
687
- for item in matches :
687
+ for _item in matches :
688
+ item = _item [0 ]
688
689
if len (item .strip ())> 2 :
689
- item = ' ' + item + ' '
690
- hour = "00"
690
+ item = ' ' + item . lower () + ' '
691
+ hour = "00"
691
692
minute = "00"
692
693
pm = False
693
- zero = False
694
- elott = False
695
- hour_matches = _re .findall (r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:|\-?kor|[ oó]r[aá])?' , re .IGNORECASE , item )
696
- minute_matches = _re .findall (r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]ra )(?:\-?kor|perc)?' , re .IGNORECASE , item )
697
- quarter_matches = _re .findall (r'((?:h[aá]rom)?negyed|f[eé]l)' , re .IGNORECASE , item )
694
+ zero = False
695
+ elott = False
696
+ hour_matches = _re .findall (r'\D([012]?\d(?!\d))\D*?(?!perc)(?:\:|\-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]|[ oó]r[aá]\w* )?' , re .IGNORECASE , item )
697
+ minute_matches = _re .findall (r'(?!negyed|f[eé]l)\D([0123456]?\d(?!\d))\D*?(?![oó]r[aá] )(?:\-?kor|\-?t[oóöő]l|\-?ig|\-?r[ae]| perc\w* )?' , re .IGNORECASE , item )
698
+ quarter_matches = _re .findall (r'((?:h[aá]rom)?negyed|f[eé]l)' , re .IGNORECASE , item )
698
699
am_matches = _re .findall (r'(reggel|hajnal|d[eé]lel[oöő]t|d\.?e\.?)' , re .IGNORECASE , item )
699
700
pm_matches = _re .findall (r'(d[eé]lut[aá]n|d\.?u\.?|este|[eé]j+el)' , re .IGNORECASE , item )
700
701
if len (hour_matches ) in (1 ,2 ):
@@ -763,15 +764,14 @@ def times(self,normalize=True,convert=True,current=-1):
763
764
if pm_matches :
764
765
pm = True
765
766
elif not am_matches :
766
- if current :
767
- if current >= 0 :
768
- now = current
769
- else :
770
- now = datetime .datetime .now ().hour
771
- if 'holnap' in item and hour < 9 :
772
- pm = True
773
- elif hour < 12 and now > hour :
774
- pm = True
767
+ if current is not False :
768
+ now = current
769
+ else :
770
+ now = datetime .datetime .now ().hour
771
+ if 'holnap' in item and hour < 9 :
772
+ pm = True
773
+ elif hour < 12 and now > hour :
774
+ pm = True
775
775
if pm and hour <= 12 :
776
776
hour += 12
777
777
hour %= 24
@@ -780,7 +780,7 @@ def times(self,normalize=True,convert=True,current=-1):
780
780
results .append (str (hour ).zfill (2 )+ ':' + str (minute ).zfill (2 ))
781
781
return results
782
782
else :
783
- return [item .strip () for item in matches if len (item .strip ())> 2 ]
783
+ return [item [ 0 ] .strip () for item in matches if len (item [ 0 ] .strip ())> 2 ]
784
784
return []
785
785
786
786
# extract list of time durations
@@ -795,13 +795,13 @@ def durations(self,normalize=True,convert=True):
795
795
val = 0
796
796
for sub_item in sub_matches :
797
797
match = sub_item .lower ().replace (',' ,'.' )
798
- sval = ''
798
+ sval = ''
799
799
for char in match :
800
800
if char .isdigit () or char == '.' :
801
801
sval += char
802
802
else :
803
803
break
804
- sval = float (sval )
804
+ sval = float (sval )
805
805
mpx = 1
806
806
if 'tized' in match or 'tízed' in match :
807
807
mpx = 0.1
@@ -953,11 +953,57 @@ def emails(self):
953
953
return _re .findall (r'\b([\w\d\-\_\.]+\@[\w\d\-\_\.]+\.\w{2,4}(?:\.\w{2,4})?)\b' , re .IGNORECASE , self .text )
954
954
return []
955
955
956
+ # extract relative dates like tomorrow or wednesday
957
def relative_dates(self, normalize=True, current=False):
    """Extract relative date expressions (e.g. tomorrow, next Wednesday).

    Args:
        normalize: when true, every hit is resolved to a ``YYYY-MM-DD``
            string; otherwise the stripped matched phrases are returned.
        current: reference day as a ``"%Y-%m-%d"`` string.  The default
            ``False`` means "use ``datetime.datetime.now()``".

    Returns:
        A list of strings; an empty list when ``self.text`` is falsy.
    """
    if not self.text:
        return []

    # NOTE(review): `_re.findall` is a helper defined elsewhere in this file
    # and is called as (pattern, flags, text); it is presumed to return the
    # single capture group of each hit as a string -- confirm.
    found = _re.findall(r'\b((?:(?:meg)?el[oöő]z[oöő]|m[uú]lt|(?:r[aá])?k[oö]vetkez[oöő]|j[oö]v[oöő])?\s?(?:h[eé]t(?:i|en)?\s?)?(?:tegnap(?:el[oöő]t+)?|holnap(?:ut[aá]n)?|m[aá](?:i nap)?|h[eé]tf[oöő]|ked+|szerd[aá]|cs[uü]t[oö]rt[oö]k|p[eé]ntek|szo[nm]bat|vas[aá]rnap))(?:[aáeoö][dm])?(?:ig|r[ae]|t[oóöő]l|[aáeoöő]?t|[dkmnptv][ae][lk]|[aáeoö]?n)?\b', re.IGNORECASE, self.text)

    if not normalize:
        return [phrase.strip() for phrase in found]

    if current is False:
        base = datetime.datetime.now()
    else:
        base = datetime.datetime.strptime(current, "%Y-%m-%d")

    # Weekdays that are recognised through a regex, with their distance
    # from Monday.  Tuesday and Wednesday use plain substring checks below.
    regex_weekdays = (
        (r'cs[uü]t[oö]rt[oö]k', 3),
        (r'p[eé]ntek', 4),
        (r'szo[mn]bat', 5),
        (r'vas[aá]rnap', 6),
    )

    normalized = []
    for phrase in found:
        phrase = phrase.lower()
        if 'holnap' in phrase:
            # "holnapután" (day after tomorrow) carries the 'ut' marker.
            shift = 2 if 'ut' in phrase else 1
            day = base + datetime.timedelta(days=shift)
        elif 'tegnap' in phrase:
            # "tegnapelőtt" (day before yesterday) carries the 'el' marker.
            shift = -2 if 'el' in phrase else -1
            day = base + datetime.timedelta(days=shift)
        elif 'ma' in phrase or 'má' in phrase:
            # "today": the reference day itself.
            day = base
        else:
            # A named weekday: rewind to Monday of the reference week,
            # optionally hop one week back/forward, then add the weekday.
            day = base - datetime.timedelta(days=base.weekday())
            if _re.findall(r'((?:meg)?el[oöő]z[oöő]|m[uú]lt)', re.IGNORECASE, phrase):
                day -= datetime.timedelta(weeks=1)
            elif _re.findall(r'((?:r[aá])?k[oö]vetkez[oöő]|j[oö]v[oöő])', re.IGNORECASE, phrase):
                day += datetime.timedelta(weeks=1)

            if 'ked' in phrase:
                offset = 1
            elif 'szerd' in phrase:
                offset = 2
            else:
                offset = 0  # Monday, unless a later weekday pattern hits
                for pattern, distance in regex_weekdays:
                    if _re.findall(pattern, re.IGNORECASE, phrase):
                        offset = distance
                        break
            day += datetime.timedelta(days=offset)
        normalized.append(day.strftime('%Y-%m-%d'))
    return normalized
1002
+
956
1003
# Converts text representation of numbers to digits
957
1004
def _convert_numbers (self ,text ):
958
1005
if text :
959
- #fix = _re.sub(r'(?<=\d)\s+(?=\d)',re.IGNORECASE,'',text.lower())
960
- matches = _re .findall (r'((?:m[ií]n[uú]sz\s?|negat[ií]v\s?)?(?:(?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|egy|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]gy|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?\W*)|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó])\W*)+(?:[aeoö]dik)?(?:j?[aáeéi]+n?)?)\b' , re .IGNORECASE , text .lower ())
1006
+ matches = _re .findall (r'((?:m[ií]n[uú]sz\s?|negat[ií]v\s?)?(?:(?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|eg+y|els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]g+y|[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?\W*)|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó])\W*)+(?:[aeoö]dik)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l)?)\b' , re .IGNORECASE , text )
961
1007
results = {}
962
1008
for match in matches :
963
1009
value = 0
@@ -968,7 +1014,7 @@ def _convert_numbers(self,text):
968
1014
else :
969
1015
minus = 1
970
1016
minusm = ''
971
- parts = _re .findall (r'((?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|egy |els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]gy |[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó]))\W*)' , re .IGNORECASE , match )
1017
+ parts = _re .findall (r'((?:(?:(?:(?:t[ií]z|h[uú]sz|harminc)(?:[eo]n)?)?(?:nulla|eg+y |els[eoöő]|k[eé]t+[oöő]?|m[aá]sod(?:ik)?|h[aá]rom|harmadik|n[eé]g+y |[oö]t|hat|h[eé]t|nyolc|kilenc)(?:v[ae]n)?)(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z)?|(?:milli[aá]rd|milli[oó]|ezer|sz[aá]z|t[ií]z|h[uú]sz|harminc|nulla|z[eé]r[oó]))\W*)' , re .IGNORECASE , match )
972
1018
values = []
973
1019
for part in parts :
974
1020
val = 0
@@ -1045,7 +1091,7 @@ def _convert_numbers(self,text):
1045
1091
1046
1092
swap = sorted (results .items (), key = lambda x : x [1 ], reverse = True )
1047
1093
for item in swap :
1048
- text = _re .sub (r'\b(' + re .escape (item [0 ])+ r')(?:[aeoö]dik?)?(?:j?[aáeéi]+n? )?\b' , re .IGNORECASE , str (item [1 ]), text )
1094
+ text = _re .sub (r'\b(' + re .escape (item [0 ])+ r')(?:[aeoö]dik?)?(?:j?[aáeéi]+[gnt]?|[aáeéoöő]?t|kor|t[oóöő]l|r[ae]|[ckmrtvz]?[ae]l )?\b' , re .IGNORECASE , str (item [1 ]), text )
1049
1095
return text
1050
1096
return ''
1051
1097
@@ -1063,9 +1109,9 @@ def _convert_numbers_helper(self,match,default):
1063
1109
return 2
1064
1110
elif _re .findall (r'(harmadik|h[aá]rom)' , re .IGNORECASE , match ):
1065
1111
return 3
1066
- elif _re .findall (r'n[eé]gy ' , re .IGNORECASE , match ):
1112
+ elif _re .findall (r'n[eé]g+y ' , re .IGNORECASE , match ):
1067
1113
return 4
1068
- elif 'egy' in match or 'els' in match :
1114
+ elif 'egy' in match or 'els' in match or 'eggy' in match :
1069
1115
return 1
1070
1116
elif _re .findall (r'[oö]t' , re .IGNORECASE , match ):
1071
1117
return 5
0 commit comments