Skip to content

Commit 1b15a81

Browse files
committed
updated parser Extract dates()
- updated parser Extract dates() based on user feedback
- added test cases for the update
- updated README.md to link to StellA
- increased version number to 1.2.1
1 parent 45920ed commit 1b15a81

File tree

4 files changed

+35
-13
lines changed

4 files changed

+35
-13
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ Instead of being an all purpose NLP tool, **Lara** was created to fit the [quirk
1414

1515
## About Lara
1616

17-
Here is a short list of things you can easily do with **Lara** in Hungarian. For full documentation and further examples, **CHECK OUT [THE WIKI](https://github.com/sedthh/lara-hungarian-nlp/wiki)**.
17+
Here is a short list of things you can easily do with **Lara** in Hungarian. For full documentation and further examples, **CHECK OUT [THE WIKI](https://github.com/sedthh/lara-hungarian-nlp/wiki)**. A complete case study on [how to make ChatBots and Virtual Assistants](https://chatbotsmagazine.com/a-complete-case-study-for-developing-smart-assistants-79316be80e89) in foreign languages is also available.
18+
1819

1920
#### Find intents
2021

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.2.0'
6+
__version__ = '1.2.1'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import lara.nlp

lara/parser.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,9 @@ def _generate(self, item):
212212
item['pattern'] = r''+item['stem']+item['affix']
213213
item['typo_pattern'] = r''+item['typo_stem']+item['typo_affix']
214214
else:
215-
216-
item['pattern'] = r'(?:'+re.escape(item['stem'])+r'{1,2}'+item['affix']+r')'
215+
item['pattern'] = r'(?:'+re.escape(item['stem'])+r'{1,2}'+item['affix']+r')'
217216
scramble = self._scramble(item['typo_stem'], (item['wordclass'] == 'adjective'))
218-
item['typo_pattern']= r'(?:'+scramble+item['typo_affix']+')'
217+
item['typo_pattern'] = r'(?:'+scramble+item['typo_affix']+')'
219218
if not item['ignorecase']:
220219
item['pattern'] = r'(?s)'+item['pattern']
221220
item['typo_pattern'] = r'(?s)'+item['typo_pattern']
@@ -232,7 +231,7 @@ def _generate(self, item):
232231

233232
item['pattern'] = item['prefix']+item['pattern']
234233
item['typo_pattern'] = item['typo_prefix']+item['typo_pattern']
235-
234+
236235
return item
237236

238237
# generate scrambled keywords
@@ -612,10 +611,13 @@ def phone_numbers(self,normalize=True,convert=True):
612611
return results
613612

614613
# extract list of common Hungarian date formats from text without further processing them
615-
def dates(self,normalize=True,convert=True):
614+
def dates(self,normalize=True,convert=True,current=False):
616615
results = []
617616
if self.text:
618-
now = datetime.datetime.now()
617+
if current:
618+
now = datetime.datetime.strptime(current,"%Y-%m-%d")
619+
else:
620+
now = datetime.datetime.now()
619621
matches = _re.findall(r'((\d{2})?(\d{2}([\\\/\.\-]\s?|\s))([eé]v\s?)?(\d{1,2}([\\\/\.\-]\s?|\s)(h[oó](nap)?\s?)?)?(\d{1,2}))\W*([aáeéio][ikn]|nap)?\b', re.IGNORECASE, self.text)
620622
for item in matches:
621623
match = re.sub('([eé]v|h[oó]|nap)', '', item[0])
@@ -639,7 +641,7 @@ def dates(self,normalize=True,convert=True):
639641
results.append(str(now.year)+'-'+parts[0].zfill(2)+'-'+parts[1].zfill(2))
640642
else:
641643
results.append(item[0])
642-
matches = _re.findall(r'((\d{2}(\d{2})?\W{1,2})?((jan|feb|m[aá]r|[aá]pr|m[aá]j|j[uú][nl]|aug|sz?ep|okt|nov|dec)\w{0,10}\W{1,2}|[ivx]{1,4}\W{0,2})(h[aoó][nv]?\w{0,7}\W{1,2})?(\d{1,2})?\W?\w*)\b', re.IGNORECASE, self.ntext if convert else self.text)
644+
matches = _re.findall(r'\b((\d{2}(\d{2})?\W{1,2})?((jan|feb|m[aá]r|[aá]pr|m[aá]j|j[uú][nl]|aug|sz?ep|okt|nov|dec)\w{0,10}\W{1,2}|[ivx]{1,4}\W{0,2})(h[aoó][nv]?\w{0,7}\W{1,2})?(\d{1,2})?\W?\w*)\b', re.IGNORECASE, self.ntext if convert else self.text)
643645
for item in matches:
644646
match = item[0].lower()
645647
year = ''
@@ -713,11 +715,22 @@ def dates(self,normalize=True,convert=True):
713715
month = str(11-len(roman)).zfill(2)
714716
else:
715717
month = str(len(roman)).zfill(2)
716-
if month and month!='00':
718+
if month and month!='00' and len(day)<=2:
717719
if normalize:
718720
results.append(year+'-'+month+'-'+day)
719721
else:
720722
results.append(item[0])
723+
if not results:
724+
matches = _re.findall(r'\b(?<!\-)([0123]?\d)[\.\-aáeéint]+(?![kloópr])', re.IGNORECASE, self.ntext if convert else self.text)
725+
for item in matches:
726+
if int(item)<=31:
727+
if normalize:
728+
year = str(now.year)
729+
month = str(now.month).zfill(2)
730+
day = item.zfill(2)
731+
results.append(year+'-'+month+'-'+day)
732+
else:
733+
results.append(item)
721734
return results
722735

723736
# extract times like 12:00 or délután 4

tests/test_parser.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -636,23 +636,31 @@ def test_parser_extract_convert_numbers(info):
636636
{
637637
"text" : "Hívj fel ezen a számon 2018 IV. huszadikán mondjuk délután nyolc perccel háromnegyed kettő előtt!",
638638
"function" : "dates",
639-
"args" : [True],
639+
"args" : [True,True,'2018-04-01'],
640640
"result" : ["2018-04-20"]
641641
}
642642
),
643643
(
644644
{
645645
"text" : "18/01/09 vagy 18-01-09 vagy 2018. 01. 09. vagy 2018. 01. 09-én vagy 2018 VII 20. és így 2018 január 20-án",
646646
"function" : "dates",
647-
"args" : [False],
647+
"args" : [False,True,'2018-04-01'],
648648
"result" : ["18/01/09","18-01-09","2018. 01. 09","2018. 01. 09","2018 VII 20","2018 január 20-án"]
649649
}
650650
),
651+
(
652+
{
653+
"text" : "találkozzunk 20-án valamikor vagy 21-én?",
654+
"function" : "dates",
655+
"args" : [True,True,'2018-04-01'],
656+
"result" : ["2018-04-20","2018-04-21"]
657+
}
658+
),
651659
(
652660
{
653661
"text" : "18/01/09 vagy 18-01-09 vagy 2019. 01. 09. vagy 2018. 01. 09-én vagy 2018 VII 20. és így 2018 január 20-án",
654662
"function" : "dates",
655-
"args" : [True],
663+
"args" : [True,True,'2018-04-01'],
656664
"result" : ["2018-01-09","2018-01-09","2019-01-09","2018-01-09","2018-07-20","2018-01-20"]
657665
}
658666
),

0 commit comments

Comments
 (0)