Skip to content

Commit

Permalink
cyrillic support
Browse files Browse the repository at this point in the history
  • Loading branch information
rkcosmos committed Jul 28, 2020
1 parent c9755f9 commit 7a62644
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 15 deletions.
17 changes: 9 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,20 @@ Ready-to-use OCR with 40+ languages supported including Chinese, Japanese, Korea

## Supported Languages

We are currently supporting the following 48 languages.
We are currently supporting the following 54 languages.

Afrikaans (af), Azerbaijani (az), Bosnian (bs), Simplified Chinese (ch_sim),
Traditional Chinese (ch_tra), Czech (cs), Welsh (cy),
Afrikaans (af), Azerbaijani (az), Belarusian (be), Bulgarian (bg), Bosnian (bs),
Simplified Chinese (ch_sim), Traditional Chinese (ch_tra), Czech (cs), Welsh (cy),
Danish (da), German (de), English (en), Spanish (es), Estonian (et),
French (fr), Irish (ga), Hindi (hi), Croatian (hr), Hungarian (hu),
Indonesian (id), Icelandic (is), Italian (it), Japanese (ja), Korean (ko),
Kurdish (ku), Latin (la), Lithuanian (lt), Latvian (lv), Maori (mi),
Kurdish (ku), Latin (la), Lithuanian (lt), Latvian (lv), Maori (mi), Mongolian (mn),
Marathi (mr), Malay (ms), Maltese (mt), Nepali (ne), Dutch (nl), Norwegian (no),
Occitan (oc), Polish (pl), Portuguese (pt), Romanian (ro),
Serbian (latin)(rs_latin), Slovak (sk) (need revisit), Slovenian (sl),
Albanian (sq), Swedish (sv),Swahili (sw), Thai (th), Tagalog (tl),
Turkish (tr), Uzbek (uz), Vietnamese (vi) (need revisit)
Occitan (oc), Polish (pl), Portuguese (pt), Romanian (ro), Russian (ru),
Serbian (cyrillic)(rs_cyrillic), Serbian (latin)(rs_latin),
Slovak (sk) (need revisit), Slovenian (sl), Albanian (sq), Swedish (sv),
Swahili (sw), Thai (th), Tagalog (tl), Turkish (tr), Ukrainian (uk), Uzbek (uz),
Vietnamese (vi) (need revisit)

List of characters is in folder [easyocr/character](https://github.com/JaidedAI/EasyOCR/tree/master/easyocr/character).
If you are a native speaker of any language and think we should add or remove any character,
Expand Down
19 changes: 12 additions & 7 deletions easyocr/easyocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@
'hr','hu','id','is','it','ku','la','lt','lv','mi','ms','mt',\
'nl','no','oc','pl','pt','ro','rs_latin','sk','sl','sq',\
'sv','sw','tl','tr','uz','vi']
cyrillic_lang_list = ['ru','rs_cyrillic','be','bg','uk','mn']
devanagari_lang_list = ['hi','mr','ne']
all_lang_list = latin_lang_list + devanagari_lang_list + ['th','ch_sim','ch_tra','ja','ko']

all_lang_list = latin_lang_list + cyrillic_lang_list + devanagari_lang_list + ['th','ch_sim','ch_tra','ja','ko']
imgH = 64
input_channel = 1
output_channel = 512
Expand All @@ -51,6 +53,7 @@
'korean.pth': ('https://www.jaided.ai/read_download/korean.pth', '45b3300e0f04ce4d03dda9913b20c336'),
'thai.pth': ('https://www.jaided.ai/read_download/thai.pth', '40a06b563a2b3d7897e2d19df20dc709'),
'devanagari.pth': ('https://www.jaided.ai/read_download/devanagari.pth', 'db6b1f074fae3070f561675db908ac08'),
'cyrillic.pth': ('https://www.jaided.ai/read_download/cyrillic.pth', '5a046f7be2a4f7da6ed50740f487efa8'),
}

class Reader(object):
Expand Down Expand Up @@ -98,6 +101,10 @@ def __init__(self, lang_list, gpu=True):
self.model_lang = 'devanagari'
if set(lang_list) - set(devanagari_lang_list+['en']) != set():
raise ValueError('Devanagari is only compatible with English, try lang_list=["hi","mr","ne","en"]')
elif set(lang_list) & set(cyrillic_lang_list):
self.model_lang = 'cyrillic'
if set(lang_list) - set(cyrillic_lang_list+['en']) != set():
raise ValueError('Cyrillic is only compatible with English, try lang_list=["ru","rs_cyrillic","be","bg","uk","mn","en"]')
else: self.model_lang = 'latin'

separator_list = {}
Expand All @@ -106,44 +113,42 @@ def __init__(self, lang_list, gpu=True):
'ÀÁÂÃÄÅÆÇÈÉÊËÍÎÑÒÓÔÕÖØÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿąęĮįıŁłŒœŠšųŽž'
self.character = number+ symbol + all_char
model_file = 'latin.pth'

elif self.model_lang == 'cyrillic':
cyrillic_char = 'ЁЂЄІЇЈЉЊЋЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюяёђєіїјљњћўџҐґҮүө'
self.character = number+ symbol + en_char + cyrillic_char
model_file = 'cyrillic.pth'
elif self.model_lang == 'devanagari':
devanagari_char = '.ँंःअअंअःआइईउऊऋएऐऑओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळवशषसह़ािीुूृॅेैॉोौ्ॐ॒क़ख़ग़ज़ड़ढ़फ़ॠ।०१२३४५६७८९॰'
self.character = number+ symbol + en_char + devanagari_char
model_file = 'devanagari.pth'

elif self.model_lang == 'chinese_tra':
char_file = os.path.join(BASE_PATH, 'character', "ch_tra_char.txt")
with open(char_file, "r", encoding = "utf-8-sig") as input_file:
ch_tra_list = input_file.read().splitlines()
ch_tra_char = ''.join(ch_tra_list)
self.character = number + symbol + en_char + ch_tra_char
model_file = 'chinese.pth'

elif self.model_lang == 'chinese_sim':
char_file = os.path.join(BASE_PATH, 'character', "ch_sim_char.txt")
with open(char_file, "r", encoding = "utf-8-sig") as input_file:
ch_sim_list = input_file.read().splitlines()
ch_sim_char = ''.join(ch_sim_list)
self.character = number + symbol + en_char + ch_sim_char
model_file = 'chinese_sim.pth'

elif self.model_lang == 'japanese':
char_file = os.path.join(BASE_PATH, 'character', "ja_char.txt")
with open(char_file, "r", encoding = "utf-8-sig") as input_file:
ja_list = input_file.read().splitlines()
ja_char = ''.join(ja_list)
self.character = number + symbol + en_char + ja_char
model_file = 'japanese.pth'

elif self.model_lang == 'korean':
char_file = os.path.join(BASE_PATH, 'character', "ko_char.txt")
with open(char_file, "r", encoding = "utf-8-sig") as input_file:
ko_list = input_file.read().splitlines()
ko_char = ''.join(ko_list)
self.character = number + symbol + en_char + ko_char
model_file = 'korean.pth'

elif self.model_lang == 'thai':
separator_list = {
'th': ['\xa2', '\xa3'],
Expand Down

0 comments on commit 7a62644

Please sign in to comment.