Skip to content

Commit 80b6b85

Browse files
committed
updated inverse stemmer for articles
- updated inverse stemmer for articles - nlp now has a function for generating "a" or "az" - updated example_stemmer_3.py to showcase this - parser Intents now matches typo stems with an accidental "ű" letter pressed at the end - improved entities and added additional commands and small_talk topics - increased version number to 1.1.11
1 parent 6452d61 commit 80b6b85

File tree

6 files changed

+77
-16
lines changed

6 files changed

+77
-16
lines changed

examples/example_stemmer_3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
parts = query.split('-')
1313
artist = stemmer.inverse(parts[0],'től') # "tól" and "től" are both valid
1414
title = stemmer.inverse(parts[1],'t')
15-
the = ('az' if nlp.vowel_beginning(title) else 'a')
15+
the = nlp.az(title)
1616

1717
print('A zenelejátszó program az alábbi számot játssza:')
1818
print(artist,the,title)

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.1.10'
6+
__version__ = '1.1.11'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/entities.py

Lines changed: 13 additions & 10 deletions
Large diffs are not rendered by default.

lara/nlp.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def remove_double_letters(text, replace=''):
2424
return replace.join([text[i] for i in range(len(text)-1) if text[i+1]!= text[i]]+[text[-1]])
2525
return ''
2626

27-
def remove_space_between_numbers(text, replace=''):
27+
def remove_spaces_between_numbers(text, replace=''):
2828
if text:
2929
return re.sub(r'(?<=\d)[\s\\\-/]+(?=\d)', replace, text)
3030
return ''
@@ -363,3 +363,23 @@ def ngram(tokens,n=2):
363363
grams = [tokens[i:i+n] for i in range(len(tokens)-n+1)]
364364
return [' '.join(item) for item in grams]
365365
return []
366+
367+
# a or az
368+
def az(word):
369+
word = trim(word)
370+
if word:
371+
if vowel_beginning(word):
372+
return 'az'
373+
if word[0] == '5':
374+
return 'az'
375+
if word[0] == '1':
376+
number = ''
377+
for char in word:
378+
if char.isnumeric():
379+
number += char
380+
else:
381+
if char != ' ':
382+
break
383+
if len(number) in (1,4,7,10):
384+
return 'az'
385+
return 'a'

lara/parser.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ class Intents:
1111
prefixes = r'(?:(?i)'+('|'.join(["abba","alá","át","be","bele","benn","el","ellen","elő","fel","föl","hátra","hozzá","ide","ki","körül","le","meg","mellé","neki","oda","össze","rá","szét","túl","utána","vissza"]))+')?'
1212
typo_prefixes = r'(?:(?i)'+('|'.join(["aba","ala","at","be","bele","ble","ben","el","elen","eln","elo","fel","fol","hatra","htara","harta","hoza","hzoa","ide","ki","korul","kroul","kourl","le","meg","mele","mle","neki","nkei","oda","osze","ozse","ra","szet","sezt","tul","utana","uatna","utna","visza","vsiza","vizsa"]))+')?'
1313
pattern_noun = r'(?i)a?i?n?(?:[aáeéioóöőuúü]?[djknmrst])?(?:[abjhkntv]?[aáeéioóöőuúü]?[lgkntz]?)?(?:[ae][kt])?'
14-
typo_pattern_noun = r'(?i)a?i?n?(?:[aeiou]?[djknmrst])?(?:[abjhkntv]?[aeiou]?[lgkntz]?)?(?:[ae][kt])?'
14+
typo_pattern_noun = r'(?i)a?i?n?(?:[aeiou]?[djknmrst])?(?:[abjhkntv]?[aeiou]?[lgkntz]?)?(?:[ae][kt])?u?'
1515
pattern_adj = r'(?i)(?:[aeoóöő]?s)?(?:[aáeéoó]?b{0,2})(?:[ae]?[nk])?(?:j?[ae])?(?:(?:[aáeéioóöőuúü]?[dklmnt])?(?:[aáeéioóöőuúü]?[klnt]?)?)(?:s[aáeé]g[ae]?(?:i\w*)?)?'
16-
typo_pattern_adj = r'(?i)(?:[aeo]?s)?(?:[aeo]?b?)(?:[ae]?[nk])?(?:j?[ae])?(?:(?:[aeiou]?[dklmnt])?(?:[aeiou]?[klnt]?)?)(?:s[ae]g[ae]?(?:i\w*)?)?'
16+
typo_pattern_adj = r'(?i)(?:[aeo]?s)?(?:[aeo]?b?)(?:[ae]?[nk])?(?:j?[ae])?(?:(?:[aeiou]?[dklmnt])?(?:[aeiou]?[klnt]?)?)(?:s[ae]g[ae]?(?:i\w*)?)?u?'
1717
pattern_verb = r'(?i)(?:h[ae][st]+e?)?(?:j?[ae])?(?:[eaá]?s{0,2}e?d?|[aáeéo]tt)?(?:(?:[jntv]|[eo]?g[ae]t+)?(?:[aeioöuü]n?[dklmt]|n[aáeéi]k?|sz|[aái])?(?:t[aáeéou][dkmt]?(?:ok)?)?)?(?:(?:t[ae]t)?(?:h[ae]t(?:[jnt]?[aáeéou](?:[dkm]|t[eéo]k)?)?t*)|[aáeé]?z?ni)?'
18-
typo_pattern_verb = r'(?i)(?:h[ae][st]e?)?(?:j?[ae])?(?:[eaá]?s?e?d?|[aeo]t)?(?:(?:[jntv]|[eo]?g[ae]t)?(?:[aeiou]n?[dklmt]|n[aei]k?|sz|[ai])?(?:t[aeou][dkmt]?(?:ok)?)?)?(?:(?:t[ae]t)?(?:h[ae]t(?:[jnt]?[aeou](?:[dkm]|t[eo]k)?)?t?)|[ae]?z?ni)?'
18+
typo_pattern_verb = r'(?i)(?:h[ae][st]e?)?(?:j?[ae])?(?:[eaá]?s?e?d?|[aeo]t)?(?:(?:[jntv]|[eo]?g[ae]t)?(?:[aeiou]n?[dklmt]|n[aei]k?|sz|[ai])?(?:t[aeou][dkmt]?(?:ok)?)?)?(?:(?:t[ae]t)?(?:h[ae]t(?:[jnt]?[aeou](?:[dkm]|t[eo]k)?)?t?)|[ae]?z?ni)?u?'
1919

2020
##### CONSTRUCTOR #####
2121
def __init__(self, new_intents={}, is_raw=False):

lara/stemmer.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,27 +343,49 @@ def inverse(word,affix):
343343
if not result[-1].isalnum():
344344
result = result+"-"
345345
if affix in ('ra','re'):
346+
if word in ('a','az'):
347+
return 'arra'
348+
if word=='ez':
349+
return 'erre'
346350
if word[-1].lower() in ('a','e'):
347351
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
348352
if vh == 'magas':
349353
return result+'re'
350354
else:
351355
return result+'ra'
352356
if affix in ('ba','be'):
357+
if word in ('a','az'):
358+
return 'abba'
359+
if word=='ez':
360+
return 'ebbe'
353361
if word[-1].lower() in ('a','e'):
354362
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
355363
if vh == 'magas':
356364
return result+'be'
357365
else:
358366
return result+'ba'
359367
if affix in ('ban','ben'):
368+
if word in ('a','az'):
369+
return 'abban'
370+
if word=='ez':
371+
return 'ebben'
360372
if word[-1].lower() in ('a','e'):
361373
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
362374
if vh == 'magas':
363375
return result+'ben'
364376
else:
365377
return result+'ban'
366378
if affix in ('k','s','t'):
379+
if word in ('a','az'):
380+
if affix=='k':
381+
return 'azok'
382+
if affix=='t':
383+
return 'azt'
384+
if word=='ez':
385+
if affix=='k':
386+
return 'ezek'
387+
if affix=='t':
388+
return 'ezt'
367389
if lara.nlp.is_vowel(word[-1]):
368390
if word[-1].lower() in ('a','e'):
369391
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
@@ -399,12 +421,20 @@ def inverse(word,affix):
399421
return result
400422
return result+'i'
401423
if affix in ('bol','ból','böl','ből','rol','ról','röl','ről','tol','tól','töl','től'):
424+
if word in ('a','az'):
425+
return 'a'+affix[0]+affix[0]+'ól'
426+
if word=='ez':
427+
return 'e'+affix[0]+affix[0]+'ől'
402428
if word[-1].lower() in ('a','e'):
403429
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
404430
if vh == 'magas':
405431
return result+affix[0]+'ől'
406432
return result+affix[0]+'ól'
407433
if affix in ('nak','nek'):
434+
if word in ('a','az'):
435+
return 'annak'
436+
if word == 'ez':
437+
return 'ennek'
408438
if word[-1].lower() in ('a','e'):
409439
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
410440
if vh in 'magas':
@@ -415,6 +445,10 @@ def inverse(word,affix):
415445
return result+'nek'
416446
return result+'nak'
417447
if affix in ('val','vel'):
448+
if word in ('a','az'):
449+
return 'azzal'
450+
if word == 'ez':
451+
return 'ezzel'
418452
if lara.nlp.is_vowel(word[-1]):
419453
if word[-1].lower() in ('a','e'):
420454
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
@@ -449,6 +483,10 @@ def inverse(word,affix):
449483
else:
450484
return result+'al'
451485
if affix in ('on','en','ön'):
486+
if word in ('a','az'):
487+
return 'azon'
488+
if word == 'ez':
489+
return 'ezen'
452490
if len(result)==2:
453491
if word.lower()=="fű":
454492
return "füvön"

0 commit comments

Comments
 (0)