Skip to content

Commit c12275c

Browse files
committed
updated inverse stemmer
- updated inverse stemmer for better results - added example for inverse stemmer in example_stemmer_3 - added example_stemmer_3 to test_examples - fixed minor bugs - increased version number to 1.1.3
1 parent de8a4a9 commit c12275c

File tree

5 files changed

+82
-40
lines changed

5 files changed

+82
-40
lines changed

examples/example_stemmer_2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@
1414

1515
search = ' '.join(valid)
1616

17-
print("Keresés erre:",search)
17+
print("Keresés erre:",search)
18+

examples/example_stemmer_3.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# -*- coding: UTF-8 -*-

import os.path
import sys

# Add the parent of this file's directory to the import path so that the
# `lara` package resolves when the example is run directly from examples/.
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))
from lara import stemmer, nlp

''' Inverse stemmer example '''

if __name__ == "__main__":
    query = "Toto - Afrika"

    # Split on the first hyphen only, so a hyphen inside the title is kept
    # and a query without any hyphen does not raise IndexError.
    artist_part, _, title_part = query.partition('-')
    artist = stemmer.inverse(artist_part, 'től')  # "tól" and "től" are both valid
    title = stemmer.inverse(title_part, 't')
    # Choose the article: "az" when the inflected title starts with a vowel,
    # "a" otherwise.
    the = 'az' if nlp.vowel_beginning(title) else 'a'

    print('A zenelejátszó program az alábbi számot játssza:')
    print(artist, the, title)
19+

lara/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Lara - Linguistic Aim Recognizer API
44

55
__all__ = 'nlp','parser','stemmer','entities'
6-
__version__ = '1.1.2'
6+
__version__ = '1.1.3'
77
__version_info__ = tuple(int(num) for num in __version__.split('.'))
88

99
import sys

lara/stemmer.py

Lines changed: 59 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -323,59 +323,63 @@ def just_asking(text):
323323

324324
# add affixes to words based on their vowel harmony
325325
def inverse(word,affix):
326+
word = lara.nlp.trim(word)
326327
if not word:
327328
return ''
328329
vh = lara.nlp.vowel_harmony(word)
329330
result = word
330331
if affix in ('ra','re'):
332+
if word[-1].lower() in ('a','e'):
333+
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
331334
if vh == 'magas':
332-
return result+'ra'
333-
else:
334335
return result+'re'
335-
if affix == 't':
336-
if lara.nlp.is_vowel(word[-1]):
337-
if word[-1].lower() in ('a','e'):
338-
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
339-
return result+'t'
340-
elif vh == 'magas':
341-
return result+'et'
342336
else:
343-
return result+'at'
344-
if affix == 'k':
337+
return result+'ra'
338+
if affix in ('k','s','t'):
345339
if lara.nlp.is_vowel(word[-1]):
346340
if word[-1].lower() in ('a','e'):
347341
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
348-
return result+'k'
349-
elif vh == 'magas':
350-
return result+'ek'
342+
if len(result)==2:
343+
if word.lower()=="fű":
344+
return result[0]+"üve"+affix
345+
elif word.lower()=="tó":
346+
return result[0]+"ava"+affix
347+
elif word.lower()=="ló":
348+
return result[0]+"ova"+affix
349+
return result+affix
350+
test = _inverse_only_o(result) # exceptions
351+
if test:
352+
test2 = sum([lara.nlp.is_vowel(char) for char in result]) # more exceptions
353+
if test2<2:
354+
return result+test+affix
355+
return result[:-2]+result[-1]+test+affix
356+
if vh == 'magas':
357+
return result+'e'+affix
358+
elif vh == 'vegyes':
359+
return result+'o'+affix
351360
else:
352-
return result+'ak'
361+
return result+'a'+affix
353362
if affix == 'i':
363+
if len(result)==2:
364+
if word.lower()=="fű":
365+
return result[0]+"üvi"
366+
elif word.lower()=="tó":
367+
return result[0]+"avi"
368+
elif word.lower()=="ló":
369+
return result[0]+"ovi"
354370
if word[-1]=='i':
355371
return result
356372
return result+'i'
357-
if affix in ('bol','ból','böl','ből'):
358-
if word[-1].lower() in ('a','e'):
359-
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
360-
if vh in ('magas','vegye'):
361-
return result+'ből'
362-
return result+'ból'
363-
if affix in ('rol','ról','röl','ről'):
364-
if word[-1].lower() in ('a','e'):
365-
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
366-
if vh in ('magas','vegyes'):
367-
return result+'ről'
368-
return result+'ról'
369-
if affix in ('tol','tól','töl','től'):
373+
if affix in ('bol','ból','böl','ből','rol','ról','röl','ről','tol','tól','töl','től'):
370374
if word[-1].lower() in ('a','e'):
371375
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
372-
if vh in ('magas','vegyes'):
373-
return result+'től'
374-
return result+'tól'
376+
if vh == 'magas':
377+
return result+affix[0]+'ől'
378+
return result+affix[0]+'ól'
375379
if affix in ('nak','nek'):
376380
if word[-1].lower() in ('a','e'):
377381
result = result[:-1]+result[-1].replace('a','á').replace('e','é')
378-
if vh in ('magas','vegyes'):
382+
if vh == 'magas':
379383
return result+'nek'
380384
return result+'nak'
381385
if affix in ('val','vel'):
@@ -394,15 +398,33 @@ def inverse(word,affix):
394398
else:
395399
if len(word)>1:
396400
if word[-2:].lower() in ('cs','gy','ly','ny','sz','ty','zs'):
397-
result = result[:-2]+result[-2]+result[-2]+result[-1]
401+
if word[-3].lower()!=word[-2].lower():
402+
result = result[:-2]+result[-2]+result[-2]+result[-1]
403+
else:
404+
result = result[:-2]+result[-2]+result[-1]
398405
if vh == 'magas':
399406
return result+'el'
400407
else:
401408
return result+'al'
402-
if vh == 'magas':
403-
return result+result[-1]+'el'
409+
if word[-2].lower()!=word[-1].lower():
410+
if vh == 'magas':
411+
return result+result[-1]+'el'
412+
else:
413+
return result+result[-1]+'al'
404414
else:
405-
return result+result[-1]+'al'
415+
if vh == 'magas':
416+
return result+'el'
417+
else:
418+
return result+'al'
406419
raise ValueError('Unsupported affix',affix)
407420

408-
421+
def _inverse_only_o(word):
    """Check whether every vowel in *word* is an o-type vowel (o, ó, ö, ő).

    Returns ``False`` as soon as a vowel outside that set is found.
    Otherwise returns the last vowel of the word in lowercase, or an empty
    string when the word contains no vowel at all.
    """
    o_vowels = {'o', 'O', 'ó', 'Ó', 'ö', 'Ö', 'ő', 'Ő'}
    last_vowel = ''
    for letter in word:
        # Consonants and other non-vowel characters do not affect the result.
        if not lara.nlp.is_vowel(letter):
            continue
        if letter not in o_vowels:
            return False
        last_vowel = letter
    return last_vowel.lower()
430+

tests/test_examples.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import os, errno, subprocess
55

66
@pytest.mark.parametrize("entity", [
7-
"example_chatbot_1","example_chatbot_2","example_chatbot_3","example_chatbot_4","example_chatbot_5","example_chatbot_6","example_chatbot_7","example_huszt","example_news","example_wiki_intents","example_stemmer_1","example_Stemmer_2","example_tweet","example_entities"
7+
"example_chatbot_1","example_chatbot_2","example_chatbot_3","example_chatbot_4","example_chatbot_5","example_chatbot_6","example_chatbot_7","example_huszt","example_news","example_wiki_intents","example_stemmer_1","example_stemmer_2","example_stemmer_3","example_tweet","example_entities"
88
])
99
def test_entities(entity):
1010
file = os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, 'examples/', entity+'.py')

0 commit comments

Comments
 (0)