diff --git a/.gitignore b/.gitignore index 4622249116..b7cd83cc93 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ build/* *.a *.class *.jar +__pycache__ # tessdata *.traineddata diff --git a/src/training/language_specific.py b/src/training/language_specific.py new file mode 100644 index 0000000000..76803717d2 --- /dev/null +++ b/src/training/language_specific.py @@ -0,0 +1,1294 @@ +#!/usr/bin/env python3 +# (C) Copyright 2014, Google Inc. +# (C) Copyright 2018, James R Barlow +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Set some language-specific variables. Works in conjunction with +# tesstrain.py +# + +#============================================================================= +# Language specific info +#============================================================================= + +import os +import logging +log = logging.getLogger(__name__) + +# Space-separated string of all valid language codes. +VALID_LANGUAGE_CODES=("afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat " + "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo " + "ell eng enm epo est eus fas fil fin fra frk frm gle glg " + "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old " + "jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat " + "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori " + "pan pol por pus ron rus san sin slk slv snd spa spa_old " + "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur " + "uig ukr urd uzb uzb_cyrl vie yid gle_uncial ") + +# Codes for which we have webtext but no fonts: +UNUSABLE_LANGUAGE_CODES="" + +FRAKTUR_FONTS=[ + "CaslonishFraxx Medium", + "Cloister Black, Light", + "Proclamate Light", + "UnifrakturMaguntia", + "Walbaum-Fraktur", +] + +# List of fonts to train on +LATIN_FONTS=[ + "Arial Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial", + "Courier New Bold", + "Courier New Bold Italic", + "Courier New Italic", + "Courier New", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Georgia Bold", + "Georgia Italic", + "Georgia", + "Georgia Bold Italic", + "Trebuchet MS Bold", + "Trebuchet MS Bold Italic", + "Trebuchet MS Italic", + "Trebuchet MS", + "Verdana Bold", + "Verdana Italic", + "Verdana", + "Verdana Bold Italic", + "URW Bookman L Bold", + "URW Bookman L Italic", + "URW Bookman L Bold Italic", + "Century Schoolbook L Bold", + "Century Schoolbook L Italic", + "Century Schoolbook L Bold Italic", + "Century Schoolbook L Medium", + "DejaVu Sans Ultra-Light", +] + +# List of fonts for printed/neo-Latin ('lat' language code, different from Latin script) +NEOLATIN_FONTS=[ + "GFS Bodoni", + "GFS Bodoni Bold", + "GFS Bodoni Italic", + "GFS Bodoni Bold Italic", + "GFS Didot", + "GFS Didot Bold", + "GFS Didot Italic", + "GFS Didot Bold Italic", + "Cardo", + "Cardo Bold", + "Cardo Italic", + "Wyld", + "Wyld Italic", + "EB Garamond", + "EB Garamond Italic", + "Junicode", + "Junicode Bold", + "Junicode Italic", + "Junicode Bold Italic", + "IM FELL DW Pica PRO", + "IM FELL English PRO", +
"IM FELL Double Pica PRO", + "IM FELL French Canon PRO", + "IM FELL Great Primer PRO", + "IM FELL DW Pica PRO Italic", + "IM FELL English PRO Italic", + "IM FELL Double Pica PRO Italic", + "IM FELL French Canon PRO Italic", + "IM FELL Great Primer PRO Italic", +] + +IRISH_UNCIAL_FONTS=[ + "Bunchlo Arsa Dubh GC", + "Bunchlo Arsa GC", + "Bunchlo Arsa GC Bold", + "Bunchlo Dubh GC", + "Bunchlo GC", + "Bunchlo GC Bold", + "Bunchlo Nua GC Bold", + "Bunchló na Nod GC", + "Gadelica", + "Glanchlo Dubh GC", + "Glanchlo GC", + "Glanchlo GC Bold", + "Seanchló Dubh GC", + "Seanchló GC", + "Seanchló GC Bold", + "Seanchló na Nod GC", + "Seanchló Ársa Dubh GC", + "Seanchló Ársa GC", + "Seanchló Ársa GC Bold", + "Tromchlo Beag GC", + "Tromchlo Mor GC", + "Urchlo GC", + "Urchlo GC Bold", +] + +EARLY_LATIN_FONTS=[ + *FRAKTUR_FONTS, + *LATIN_FONTS, + # The Wyld font family renders early modern ligatures encoded in the private + # unicode area. + "Wyld", + "Wyld Italic", + # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English. + "GentiumAlt", +] + +VIETNAMESE_FONTS=[ + "Arial Unicode MS Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial Unicode MS", + "FreeMono Bold", + "Courier New Bold Italic", + "FreeMono Italic", + "FreeMono", + "GentiumAlt Italic", + "GentiumAlt", + "Palatino Linotype Bold", + "Palatino Linotype Bold Italic", + "Palatino Linotype Italic", + "Palatino Linotype", + "Really No 2 LT W2G Light", + "Really No 2 LT W2G Light Italic", + "Really No 2 LT W2G Medium", + "Really No 2 LT W2G Medium Italic", + "Really No 2 LT W2G Semi-Bold", + "Really No 2 LT W2G Semi-Bold Italic", + "Really No 2 LT W2G Ultra-Bold", + "Really No 2 LT W2G Ultra-Bold Italic", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Verdana Bold", + "Verdana Italic", + "Verdana", + "Verdana Bold Italic", + "VL Gothic", + "VL PGothic", +] + +DEVANAGARI_FONTS=[ + "FreeSans", + "Chandas", + "Kalimati", + "Uttara", + "Lucida Sans", + "gargi Medium", + "Lohit Devanagari", + "Arial Unicode MS Bold", + "Ascender Uni", + "Noto Sans Devanagari Bold", + "Noto Sans Devanagari", + "Samyak Devanagari Medium", + "Sarai", + "Saral LT Bold", + "Saral LT Light", + "Nakula", + "Sahadeva", + "Samanata", + "Santipur OT Medium", +] + +KANNADA_FONTS=[ + "Kedage Bold", + "Kedage Italic", + "Kedage", + "Kedage Bold Italic", + "Mallige Bold", + "Mallige Italic", + "Mallige", + "Mallige Bold Italic", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "cheluvi Medium", + "Noto Sans Kannada Bold", + "Noto Sans Kannada", + "Lohit Kannada", + "Tunga", + "Tunga Bold", +] + +TELUGU_FONTS=[ + "Pothana2000", + "Vemana2000", + "Lohit Telugu", + "Arial Unicode MS Bold", + "Ascender Uni", + "Dhurjati", + "Gautami Bold", + "Gidugu", + "Gurajada", + "Lakki Reddy", + "Mallanna", + "Mandali", + "NATS", + "NTR", + "Noto Sans Telugu Bold", + "Noto Sans Telugu", + "Peddana", + "Ponnala", + "Ramabhadra", + "Ravi Prakash", + "Sree Krushnadevaraya", + "Suranna", + "Suravaram", + "Tenali Ramakrishna", + "Gautami", +] + +TAMIL_FONTS=[ + "TAMu_Kadambri", + "TAMu_Kalyani", + "TAMu_Maduram", + "TSCu_Paranar", + "TSCu_Times", + "TSCu_Paranar Bold", + "FreeSans", + "FreeSerif", + "Lohit Tamil", + "Arial Unicode MS Bold", + "Ascender Uni", + "Droid Sans Tamil Bold", + "Droid Sans Tamil", + "Karla Tamil Inclined Bold Italic", + "Karla Tamil Inclined Italic", + "Karla Tamil Upright Bold", + "Karla Tamil Upright", + "Noto Sans Tamil Bold", + "Noto Sans Tamil", + "Noto Sans Tamil 
UI Bold", + "Noto Sans Tamil UI", + "TSCu_Comic Normal", + "Lohit Tamil Classical", +] + +THAI_FONTS=[ + "FreeSerif", + "FreeSerif Italic", + "Garuda", + "Norasi", + "Lucida Sans Typewriter", + "Lucida Sans", + "Garuda Oblique", + "Norasi Oblique", + "Norasi Italic", + "Garuda Bold", + "Norasi Bold", + "Lucida Sans Typewriter Bold", + "Lucida Sans Semi-Bold", + "Garuda Bold Oblique", + "Norasi Bold Italic", + "Norasi Bold Oblique", + "AnuParp LT Thai", + "Arial Unicode MS Bold", + "Arial Unicode MS", + "Ascender Uni", + "Loma", + "Noto Serif Thai Bold", + "Noto Serif Thai", + "Purisa Light", + "Sirichana LT Bold", + "Sirichana LT", + "Sukothai LT Bold", + "Sukothai LT", + "UtSaHaGumm LT Thai", + "Tahoma", +] + +KOREAN_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Baekmuk Batang Patched", + "Baekmuk Batang", + "Baekmuk Dotum", + "Baekmuk Gulim", + "Baekmuk Headline", +] + +CHI_SIM_FONTS=[ + "AR PL UKai CN", + "AR PL UMing Patched Light", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "WenQuanYi Zen Hei Medium", +] + +CHI_TRA_FONTS=[ + "AR PL UKai TW", + "AR PL UMing TW MBE Light", + "AR PL UKai Patched", + "AR PL UMing Patched Light", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "WenQuanYi Zen Hei Medium", +] + +JPN_FONTS=[ + "TakaoExGothic", + "TakaoExMincho", + "TakaoGothic", + "TakaoMincho", + "TakaoPGothic", + "TakaoPMincho", + "VL Gothic", + "VL PGothic", + "Noto Sans Japanese Bold", + "Noto Sans Japanese Light", +] + +RUSSIAN_FONTS=[ + "Arial Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial", + "Courier New Bold", + "Courier New Bold Italic", + "Courier New Italic", + "Courier New", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Georgia Bold", + "Georgia Italic", + "Georgia", + "Georgia Bold Italic", + "Trebuchet MS Bold", + "Trebuchet MS Bold Italic", + "Trebuchet MS Italic", + "Trebuchet MS", + "Verdana Bold", + "Verdana Italic", + "Verdana", + "Verdana Bold Italic", + "DejaVu Serif", + "DejaVu Serif Oblique", + "DejaVu Serif Bold", + "DejaVu Serif Bold Oblique", + "Lucida Bright", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "DejaVu Sans Ultra-Light", +] + +GREEK_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "DejaVu Sans Mono", + "DejaVu Sans Mono Oblique", + "DejaVu Sans Mono Bold", + "DejaVu Sans Mono Bold Oblique", + "DejaVu Serif", + "DejaVu Serif Semi-Condensed", + "DejaVu Serif Oblique", + "DejaVu Serif Bold", + "DejaVu Serif Bold Oblique", + "DejaVu Serif Bold Semi-Condensed", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "FreeSerif Italic", + "FreeSerif", + "GentiumAlt", + "GentiumAlt Italic", + "Linux Biolinum O Bold", + "Linux Biolinum O", + "Linux Libertine O Bold", + "Linux Libertine O", + "Linux Libertine O Bold Italic", + "Linux Libertine O Italic", + "Palatino Linotype Bold", + "Palatino Linotype Bold Italic", + "Palatino Linotype Italic", + "Palatino Linotype", + "UmePlus P Gothic", + "VL PGothic", +] + +ANCIENT_GREEK_FONTS=[ + "GFS Artemisia", + "GFS Artemisia Bold", + "GFS Artemisia Bold Italic", + "GFS Artemisia Italic", + "GFS Bodoni", + "GFS Bodoni Bold", + "GFS Bodoni Bold Italic", + "GFS Bodoni Italic", + "GFS Didot", + "GFS Didot Bold", + "GFS Didot Bold Italic", + "GFS Didot Italic", + "GFS DidotClassic", + "GFS Neohellenic", + "GFS Neohellenic Bold", + "GFS Neohellenic Bold Italic", + "GFS Neohellenic Italic", + "GFS Philostratos", + "GFS Porson", + "GFS Pyrsos", + "GFS Solomos", +] + +ARABIC_FONTS=[ + "Arabic Transparent Bold", + "Arabic 
Transparent", + "Arab", + "Arial Unicode MS Bold", + "Arial Unicode MS", + "ASVCodar LT Bold", + "ASVCodar LT Light", + "Badiya LT Bold", + "Badiya LT", + "Badr LT Bold", + "Badr LT", + "Dimnah", + "Frutiger LT Arabic Bold", + "Frutiger LT Arabic", + "Furat", + "Hassan LT Bold", + "Hassan LT Light", + "Jalal LT Bold", + "Jalal LT Light", + "Midan Bold", + "Midan", + "Mitra LT Bold", + "Mitra LT Light", + "Palatino LT Arabic", + "Palatino Sans Arabic Bold", + "Palatino Sans Arabic", + "Simplified Arabic Bold", + "Simplified Arabic", + "Times New Roman, Bold", + "Times New Roman,", + "Traditional Arabic Bold", + "Traditional Arabic", +] + +HEBREW_FONTS=[ + "Arial Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial", + "Courier New Bold", + "Courier New Bold Italic", + "Courier New Italic", + "Courier New", + "Ergo Hebrew Semi-Bold", + "Ergo Hebrew Semi-Bold Italic", + "Ergo Hebrew", + "Ergo Hebrew Italic", + "Really No 2 LT W2G Light", + "Really No 2 LT W2G Light Italic", + "Really No 2 LT W2G Medium", + "Really No 2 LT W2G Medium Italic", + "Really No 2 LT W2G Semi-Bold", + "Really No 2 LT W2G Semi-Bold Italic", + "Really No 2 LT W2G Ultra-Bold", + "Really No 2 LT W2G Ultra-Bold Italic", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Lucida Sans", + "Tahoma", +] + +BENGALI_FONTS=[ + "Bangla Medium", + "Lohit Bengali", + "Mukti Narrow", + "Mukti Narrow Bold", + "Jamrul Medium Semi-Expanded", + "Likhan Medium", + "Arial Unicode MS Bold", + "Ascender Uni", + "FreeSans", + "FreeSans Oblique", + "FreeSerif", + "FreeSerif Italic", + "Noto Sans Bengali Bold", + "Noto Sans Bengali", + "Ani", + "Lohit Assamese", + "Lohit Bengali", + "Mitra Mono", +] + +KYRGYZ_FONTS=[ + "Arial", + "Arial Bold", + "Arial Italic", + "Arial Bold Italic", + "Courier New", + "Courier New Bold", + "Courier New Italic", + "Courier New Bold Italic", + "Times New Roman,", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "DejaVu Serif", + "DejaVu Serif Oblique", + "DejaVu Serif Bold", + "DejaVu Serif Bold Oblique", + "Lucida Bright", + "FreeSerif Bold", + "FreeSerif Bold Italic", +] + +PERSIAN_FONTS=[ + "Amiri Bold Italic", + "Amiri Bold", + "Amiri Italic", + "Amiri", + "Andale Sans Arabic Farsi", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Lateef", + "Lucida Bright", + "Lucida Sans Oblique", + "Lucida Sans Semi-Bold", + "Lucida Sans", + "Lucida Sans Typewriter Bold", + "Lucida Sans Typewriter Oblique", + "Lucida Sans Typewriter", + "Scheherazade", + "Tahoma", + "Times New Roman,", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Yakout Linotype Bold", + "Yakout Linotype", +] + +AMHARIC_FONTS=[ + "Abyssinica SIL" + "Droid Sans Ethiopic Bold", + "Droid Sans Ethiopic", + "FreeSerif", + "Noto Sans Ethiopic Bold", + "Noto Sans Ethiopic", +] + +ARMENIAN_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "FreeMono", + "FreeMono Italic", + "FreeSans", + "FreeSans Bold", + "FreeSans Oblique", +] + +BURMESE_FONTS=[ + "Myanmar Sans Pro", + "Noto Sans Myanmar Bold", + "Noto Sans Myanmar", + "Padauk Bold", + "Padauk", + "TharLon", +] + +JAVANESE_FONTS=[ + "Prada", +] + +NORTH_AMERICAN_ABORIGINAL_FONTS=[ + "Aboriginal Sans", + "Aboriginal Sans Bold Italic", + "Aboriginal Sans Italic", + "Aboriginal Sans Bold", + "Aboriginal Serif Bold", + "Aboriginal Serif Bold Italic", + "Aboriginal Serif Italic", + "Aboriginal Serif", +] + +GEORGIAN_FONTS=[ + 
"Arial Unicode MS Bold", + "Arial Unicode MS", + "BPG Algeti GPL\&GNU", + "BPG Chveulebrivi GPL\&GNU", + "BPG Courier GPL\&GNU", + "BPG Courier S GPL\&GNU", + "BPG DejaVu Sans 2011 GNU-GPL", + "BPG Elite GPL\&GNU", + "BPG Excelsior GPL\&GNU", + "BPG Glaho GPL\&GNU", + "BPG Gorda GPL\&GNU", + "BPG Ingiri GPL\&GNU", + "BPG Mrgvlovani Caps GNU\&GPL", + "BPG Mrgvlovani GPL\&GNU", + "BPG Nateli Caps GPL\&GNU Light", + "BPG Nateli Condenced GPL\&GNU Light", + "BPG Nateli GPL\&GNU Light", + "BPG Nino Medium Cond GPL\&GNU", + "BPG Nino Medium GPL\&GNU Medium", + "BPG Sans GPL\&GNU", + "BPG Sans Medium GPL\&GNU", + "BPG Sans Modern GPL\&GNU", + "BPG Sans Regular GPL\&GNU", + "BPG Serif GPL\&GNU", + "BPG Serif Modern GPL\&GNU", + "FreeMono", + "FreeMono Bold Italic", + "FreeSans", + "FreeSerif", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "FreeSerif Italic", +] + +OLD_GEORGIAN_FONTS=[ + "Arial Unicode MS Bold", + "Arial Unicode MS", + "BPG Algeti GPL\&GNU", + "BPG Courier S GPL\&GNU", + "BPG DejaVu Sans 2011 GNU-GPL", + "BPG Elite GPL\&GNU", + "BPG Excelsior GPL\&GNU", + "BPG Glaho GPL\&GNU", + "BPG Ingiri GPL\&GNU", + "BPG Mrgvlovani Caps GNU\&GPL", + "BPG Mrgvlovani GPL\&GNU", + "BPG Nateli Caps GPL\&GNU Light", + "BPG Nateli Condenced GPL\&GNU Light", + "BPG Nateli GPL\&GNU Light", + "BPG Nino Medium Cond GPL\&GNU", + "BPG Nino Medium GPL\&GNU Medium", + "BPG Sans GPL\&GNU", + "BPG Sans Medium GPL\&GNU", + "BPG Sans Modern GPL\&GNU", + "BPG Sans Regular GPL\&GNU", + "BPG Serif GPL\&GNU", + "BPG Serif Modern GPL\&GNU", + "FreeSans", + "FreeSerif", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "FreeSerif Italic", +] + +KHMER_FONTS=[ + "Khmer OS", + "Khmer OS System", + "Khmer OS Battambang", + "Khmer OS Bokor", + "Khmer OS Content", + "Khmer OS Fasthand", + "Khmer OS Freehand", + "Khmer OS Metal Chrieng", + "Khmer OS Muol Light", + "Khmer OS Muol Pali", + "Khmer OS Muol", + "Khmer OS Siemreap", + "Noto Sans Bold", + "Noto Sans", + "Noto Serif Khmer Bold", + "Noto Serif Khmer Light", +] + +KURDISH_FONTS=[ + "Amiri Bold Italic", + "Amiri Bold", + "Amiri Italic", + "Amiri", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Lateef", + "Lucida Bright", + "Lucida Sans Oblique", + "Lucida Sans Semi-Bold", + "Lucida Sans", + "Lucida Sans Typewriter Bold", + "Lucida Sans Typewriter Oblique", + "Lucida Sans Typewriter", + "Scheherazade", + "Tahoma", + "Times New Roman,", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Unikurd Web", + "Yakout Linotype Bold", + "Yakout Linotype", +] + +LAOTHIAN_FONTS=[ + "Phetsarath OT", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "Dhyana Bold", + "Dhyana", + "Lao Muang Don", + "Lao Muang Khong", + "Lao Sans Pro", + "Noto Sans Lao Bold", + "Noto Sans Lao", + "Noto Sans Lao UI Bold", + "Noto Sans Lao UI", + "Noto Serif Lao Bold", + "Noto Serif Lao", + "Phetsarath Bold", + "Phetsarath", + "Souliyo Unicode", +] + +GUJARATI_FONTS=[ + "Lohit Gujarati", + "Rekha Medium", + "Samyak Gujarati Medium", + "aakar Medium", + "padmaa Bold", + "padmaa Medium", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "FreeSans", + "Noto Sans Gujarati Bold", + "Noto Sans Gujarati", + "Shruti", + "Shruti Bold", +] + +MALAYALAM_FONTS=[ + "AnjaliOldLipi", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "Dyuthi", + "FreeSerif", + "Kalyani", + "Kartika", + "Kartika Bold", + "Lohit Malayalam", + "Meera", + "Noto Sans Malayalam Bold", + "Noto Sans Malayalam", + "Rachana", + "Rachana_w01", 
+ "RaghuMalayalam", + "suruma", +] + +ORIYA_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "ori1Uni Medium", + "Samyak Oriya Medium", + "Lohit Oriya", +] + +PUNJABI_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "Saab", + "Lohit Punjabi", + "Noto Sans Gurmukhi", + "Noto Sans Gurmukhi Bold", + "FreeSans", + "FreeSans Bold", + "FreeSerif", +] + +SINHALA_FONTS=[ + "Noto Sans Sinhala Bold", + "Noto Sans Sinhala", + "OCRUnicode", + "Yagpo", + "LKLUG", + "FreeSerif", +] + +SYRIAC_FONTS=[ + "East Syriac Adiabene", + "East Syriac Ctesiphon", + "Estrangelo Antioch", + "Estrangelo Edessa", + "Estrangelo Midyat", + "Estrangelo Nisibin", + "Estrangelo Quenneshrin", + "Estrangelo Talada", + "Estrangelo TurAbdin", + "Serto Batnan Bold", + "Serto Batnan", + "Serto Jerusalem Bold", + "Serto Jerusalem Italic", + "Serto Jerusalem", + "Serto Kharput", + "Serto Malankara", + "Serto Mardin Bold", + "Serto Mardin", + "Serto Urhoy Bold", + "Serto Urhoy", + "FreeSans", +] + +THAANA_FONTS=[ + "FreeSerif", +] + +TIBETAN_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "DDC Uchen", + "Jomolhari", + "Kailasa", + "Kokonor", + "Tibetan Machine Uni", + "TibetanTsugRing", + "Yagpo", +] + +# The following fonts will be rendered vertically in phase I. +VERTICAL_FONTS=[ + "TakaoExGothic", + "TakaoExMincho", + "AR PL UKai Patched", + "AR PL UMing Patched Light", + "Baekmuk Batang Patched", +] + +FLAGS_webtext_prefix=os.environ.get('FLAGS_webtext_prefix', '') + +# Set language-specific values for several global variables, including +# ${TEXT_CORPUS} +# holds the text corpus file for the language, used in phase F +# ${FONTS[@]} +# holds a sequence of applicable fonts for the language, used in +# phase F & I. only set if not already set, i.e. from command line +# ${TRAINING_DATA_ARGUMENTS} +# non-default arguments to the training_data program used in phase T +# ${FILTER_ARGUMENTS}[ -] +# character-code-specific filtering to distinguish between scripts +# (eg. CJK) used by filter_borbidden_characters in phase F +# ${WORDLIST2DAWG_ARGUMENTS} +# specify fixed length dawg generation for non-space-delimited lang +# TODO(dsl): We can refactor these into functions that assign FONTS, +# TEXT_CORPUS, etc. separately. +def set_lang_specific_parameters(ctx, lang): + # The default text location is now given directly from the language code. + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/{lang}.corpus.txt" + FILTER_ARGUMENTS=[] + WORDLIST2DAWG_ARGUMENTS="" + # These dawg factors represent the fraction of the corpus not covered by the + # dawg, and seem like reasonable defaults, but the optimal value is likely + # to be highly corpus-dependent, as well as somewhat language-dependent. + # Number dawg factor is the fraction of all numeric strings that are not + # covered, which is why it is higher relative to the others. + PUNC_DAWG_FACTOR=None + NUMBER_DAWG_FACTOR=0.125 + WORD_DAWG_FACTOR=0.05 + BIGRAM_DAWG_FACTOR=0.015 + TRAINING_DATA_ARGUMENTS=[] + FRAGMENTS_DISABLED="y" + RUN_SHAPE_CLUSTERING=False + AMBIGS_FILTER_DENOMINATOR="100000" + LEADING=32 + MEAN_COUNT=40 # Default for latin script. + # Language to mix with the language for maximum accuracy. Defaults to eng. + # If no language is good, set to the base language. + MIX_LANG="eng" + FONTS=ctx.fonts + TEXT2IMAGE_EXTRA_ARGS=[] + EXPOSURES=[] + + + # Latin languages. 
+ if lang == 'enm': + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'frm': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/fra.corpus.txt" + # Make long-s substitutions for Middle French text + FILTER_ARGUMENTS+=["--make_early_language_variant=fra"] + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported. + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'frk': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/deu.corpus.txt" + if not FONTS: FONTS = FRAKTUR_FONTS + elif lang == 'ita_old': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/ita.corpus.txt" + # Make long-s substitutions for Early Italian text + FILTER_ARGUMENTS+=["--make_early_language_variant=ita"] + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported. + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'lat': + if not EXPOSURES: EXPOSURES="-3 -2 -1 0 1 2 3".split() + if not FONTS: FONTS = NEOLATIN_FONTS + elif lang == 'spa_old': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/spa.corpus.txt" + # Make long-s substitutions for Early Spanish text + FILTER_ARGUMENTS+=["--make_early_language_variant=spa"] + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported. + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'srp_latn': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/srp.corpus.txt" + elif lang == 'vie': + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = VIETNAMESE_FONTS + # Highly inflective languages get a bigger dawg size. + # TODO(rays) Add more here! + elif lang == 'hun': + WORD_DAWG_SIZE=1000000 + elif lang == 'pol': + WORD_DAWG_SIZE=1000000 + + # Latin with default treatment. + elif lang == 'afr': + pass + elif lang == 'aze': + pass + elif lang == 'bos': + pass + elif lang == 'cat': + pass + elif lang == 'ceb': + pass + elif lang == 'ces': + PUNC_DAWG_FACTOR=0.004 + elif lang == 'cym': + pass + elif lang == 'dan': + pass + elif lang == 'deu': + WORD_DAWG_FACTOR=0.125 + elif lang == 'eng': + WORD_DAWG_FACTOR=0.03 + elif lang == 'epo': + pass + elif lang == 'est': + pass + elif lang == 'eus': + pass + elif lang == 'fil': + pass + elif lang == 'fin': + pass + elif lang == 'fra': + WORD_DAWG_FACTOR=0.08 + elif lang == 'gle': + pass + elif lang == 'gle_uncial': + if not FONTS: FONTS = IRISH_UNCIAL_FONTS + elif lang == 'glg': + pass + elif lang == 'hat': + pass + elif lang == 'hrv': + pass + elif lang == 'iast': + pass + elif lang == 'ind': + pass + elif lang == 'isl': + pass + elif lang == 'ita': + pass + elif lang == 'jav': + pass + elif lang == 'lav': + pass + elif lang == 'lit': + pass + elif lang == 'mlt': + pass + elif lang == 'msa': + pass + elif lang == 'nld': + WORD_DAWG_FACTOR=0.02 + elif lang == 'nor': + pass + elif lang == 'por': + pass + elif lang == 'ron': + pass + elif lang == 'slk': + pass + elif lang == 'slv': + pass + elif lang == 'spa': + pass + elif lang == 'sqi': + pass + elif lang == 'swa': + pass + elif lang == 'swe': + pass + elif lang == 'tgl': + pass + elif lang == 'tur': + pass + elif lang == 'uzb': + pass + elif lang == 'zlm': + pass + + # Special code for performing language-id that is trained on + # EFIGS+Latin+Vietnamese text with regular + fraktur fonts. + elif lang == 'lat_lid': + TEXT_CORPUS=f'{FLAGS_webtext_prefix}/lat_lid.corpus.txt' + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + GENERATE_WORD_BIGRAMS=0 + # Strip unrenderable words as not all fonts will render the extended + # latin symbols found in Vietnamese text. 
+ WORD_DAWG_SIZE=1000000 + if not FONTS: FONTS = EARLY_LATIN_FONTS + + # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic. + elif lang == 'rus': + if not FONTS: FONTS = RUSSIAN_FONTS + MIX_LANG="rus" + NUMBER_DAWG_FACTOR=0.05 + WORD_DAWG_SIZE=1000000 + elif lang in ('aze_cyrl','bel','bul','kaz','mkd','srp','tgk','ukr','uzb_cyrl' ): + MIX_LANG=f"{lang}" + if not FONTS: FONTS = RUSSIAN_FONTS + + # Special code for performing Cyrillic language-id that is trained on + # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian + # text with the list of Russian fonts. + elif lang == 'cyr_lid': + TEXT_CORPUS=f'{FLAGS_webtext_prefix}/cyr_lid.corpus.txt' + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + GENERATE_WORD_BIGRAMS=0 + WORD_DAWG_SIZE=1000000 + if not FONTS: FONTS = RUSSIAN_FONTS + + # South Asian scripts mostly have a lot of different graphemes, so trim + # down the MEAN_COUNT so as not to get a huge amount of text. + elif lang in ('asm','ben' ): + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = BENGALI_FONTS + elif lang in ( 'bih','hin','mar','nep','san' ): + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = DEVANAGARI_FONTS + elif lang == 'bod': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = TIBETAN_FONTS + elif lang == 'dzo': + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = TIBETAN_FONTS + elif lang == 'guj': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = GUJARATI_FONTS + elif lang == 'kan': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = KANNADA_FONTS + elif lang == 'mal': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = MALAYALAM_FONTS + elif lang == 'ori': + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = ORIYA_FONTS + elif lang == 'pan': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = PUNJABI_FONTS + elif lang == 'sin': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = SINHALA_FONTS + elif lang == 'tam': + MEAN_COUNT=30 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = TAMIL_FONTS + elif lang == 'tel': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = TELUGU_FONTS + + # SouthEast Asian scripts. 
+ elif lang == 'jav_java': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = JAVANESE_FONTS + elif lang == 'khm': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = KHMER_FONTS + elif lang == 'lao': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = LAOTHIAN_FONTS + elif lang == 'mya': + MEAN_COUNT=12 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = BURMESE_FONTS + elif lang == 'tha': + MEAN_COUNT=30 + WORD_DAWG_FACTOR=0.01 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + FILTER_ARGUMENTS+=["--segmenter_lang=tha"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + AMBIGS_FILTER_DENOMINATOR="1000" + LEADING=48 + if not FONTS: FONTS = THAI_FONTS + + # CJK + elif lang == 'chi_sim': + MEAN_COUNT=15 + PUNC_DAWG_FACTOR=0.015 + WORD_DAWG_FACTOR=0.015 + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + FILTER_ARGUMENTS+=["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"] + if not FONTS: FONTS = CHI_SIM_FONTS + elif lang == 'chi_tra': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.015 + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + FILTER_ARGUMENTS+=["--charset_filter=chi_tra", "--segmenter_lang=chi_tra"] + if not FONTS: FONTS = CHI_TRA_FONTS + elif lang == 'jpn': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.015 + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + FILTER_ARGUMENTS+=["--charset_filter=jpn", "--segmenter_lang=jpn"] + if not FONTS: FONTS = JPN_FONTS + elif lang == 'kor': + MEAN_COUNT=20 + WORD_DAWG_FACTOR=0.015 + NUMBER_DAWG_FACTOR=0.05 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--desired_bigrams="] + GENERATE_WORD_BIGRAMS=0 + FILTER_ARGUMENTS+=["--charset_filter=kor","--segmenter_lang=kor"] + if not FONTS: FONTS = KOREAN_FONTS + + # Middle-Eastern scripts. + elif lang == 'ara': + if not FONTS: FONTS = ARABIC_FONTS + elif lang == 'div': + if not FONTS: FONTS = THAANA_FONTS + elif lang in ('fas','pus','snd','uig','urd' ): + if not FONTS: FONTS = PERSIAN_FONTS + elif lang in ('heb','yid' ): + NUMBER_DAWG_FACTOR=0.05 + WORD_DAWG_FACTOR=0.08 + if not FONTS: FONTS = HEBREW_FONTS + elif lang == 'syr': + if not FONTS: FONTS = SYRIAC_FONTS + + # Other scripts.
+ elif lang in ('amh','tir'): + if not FONTS: FONTS = AMHARIC_FONTS + elif lang == 'chr': + if not FONTS: + FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"] + elif lang == 'ell': + NUMBER_DAWG_FACTOR=0.05 + WORD_DAWG_FACTOR=0.08 + if not FONTS: FONTS = GREEK_FONTS + elif lang == 'grc': + if not EXPOSURES: EXPOSURES="-3 -2 -1 0 1 2 3".split() + if not FONTS: FONTS = ANCIENT_GREEK_FONTS + elif lang == 'hye': + if not FONTS: FONTS = ARMENIAN_FONTS + elif lang == 'iku': + if not FONTS: FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS + elif lang == 'kat': + if not FONTS: FONTS = GEORGIAN_FONTS + elif lang == 'kat_old': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/kat.corpus.txt" + if not FONTS: FONTS = OLD_GEORGIAN_FONTS + elif lang == 'kir': + if not FONTS: FONTS = KYRGYZ_FONTS + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=100"] + elif lang == 'kur': + if not FONTS: FONTS = KURDISH_FONTS + else: + raise ValueError(f"{lang} is not a valid language code") + + + FLAGS_mean_count = int(os.environ.get('FLAGS_mean_count', -1)) + if FLAGS_mean_count > 0: + TRAINING_DATA_ARGUMENTS+=[f"--mean_count={FLAGS_mean_count}"] + elif MEAN_COUNT: + TRAINING_DATA_ARGUMENTS+=[f"--mean_count={MEAN_COUNT}"] + + # Default to Latin fonts if none have been set + if not FONTS: FONTS = LATIN_FONTS + + # Default to 0 exposure if it hasn't been set + if not EXPOSURES: EXPOSURES=[0] + # Set right-to-left and normalization mode. + if lang in ('ara','div', 'fas','pus','snd','syr','uig','urd','kur_ara','heb','yid'): + LANG_IS_RTL=True + NORM_MODE=2 + elif lang in ( + 'asm','ben','bih','hin','mar','nep','guj','kan','mal','tam','tel','pan', + 'dzo','sin','san','bod','ori','khm','mya','tha','lao','jav','jav_java' + ): + LANG_IS_RTL=False + NORM_MODE=2 + else: + LANG_IS_RTL=False + NORM_MODE=1 + + # Copy the uppercase locals set above onto ctx as lowercase attributes, + # logging any values that changed. + for var in list(locals()): + if var.isupper(): + value = locals()[var] + lowervar = var.lower() + if hasattr(ctx, lowervar) and getattr(ctx, lowervar) != value: + log.debug(f"{lowervar} = {value} (was {getattr(ctx, lowervar)})") + setattr(ctx, lowervar, value) + elif hasattr(ctx, lowervar): + log.debug(f"{lowervar} = {value} (set on cmdline)") + else: + log.debug(f"{lowervar} = {value}") + setattr(ctx, lowervar, value) + + return ctx + +#============================================================================= +# END of Language specific info +#============================================================================= diff --git a/src/training/tesstrain.py b/src/training/tesstrain.py new file mode 100644 index 0000000000..a6aa6276b9 --- /dev/null +++ b/src/training/tesstrain.py @@ -0,0 +1,92 @@ +# (C) Copyright 2014, Google Inc. +# (C) Copyright 2018, James R Barlow +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script provides an easy way to execute various phases of training +# Tesseract.
For a detailed description of the phases, see +# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract +# +import sys,os,subprocess, logging + + +sys.path.insert(0, os.path.dirname(__file__)) +from tesstrain_utils import parse_flags, initialize_fontconfig, phase_I_generate_image, \ + phase_UP_generate_unicharset, phase_E_extract_features, make_lstmdata, cleanup +import language_specific + +log = logging.getLogger() + +def setup_logging(logfile): + log.setLevel(logging.DEBUG) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + console_formatter = logging.Formatter( + '[%(asctime)s] %(levelname)s - %(message)s', + datefmt='%H:%M:%S' + ) + console.setFormatter(console_formatter) + log.addHandler(console) + + logfile = logging.FileHandler(logfile) + logfile.setLevel(logging.DEBUG) + logfile_formatter = logging.Formatter( + '[%(asctime)s] - %(levelname)s - %(name)s - %(message)s' + ) + logfile.setFormatter(logfile_formatter) + log.addHandler(logfile) + +def main(): + ctx = parse_flags() + setup_logging(ctx.log_file) + if not ctx.linedata: + log.error('--linedata_only is required since only LSTM is supported') + sys.exit(1) + + log.info(f"=== Starting training for language {ctx.lang_code}") + ctx = language_specific.set_lang_specific_parameters(ctx, ctx.lang_code) + + initialize_fontconfig(ctx) + phase_I_generate_image(ctx, par_factor=8) + phase_UP_generate_unicharset(ctx) + + if ctx.linedata: + phase_E_extract_features(ctx, ['--psm', '6', 'lstm.train'], 'lstmf') + make_lstmdata(ctx) + + cleanup(ctx) + log.info("All done!") + return 0 + +if __name__ == '__main__': + main() + + +# _rc0 = subprocess.call(["tlog","\n=== Starting training for language '"+str(LANG_CODE.val)+"'"],shell=True) +# _rc0 = subprocess.call(["source",os.popen("dirname "+__file__).read().rstrip("\n")+"/language-specific.sh"],shell=True) +# _rc0 = subprocess.call(["set_lang_specific_parameters",str(LANG_CODE.val)],shell=True) +# _rc0 = subprocess.call(["initialize_fontconfig"],shell=True) +# _rc0 = subprocess.call(["phase_I_generate_image","8"],shell=True) +# _rc0 = subprocess.call(["phase_UP_generate_unicharset"],shell=True) +# if (LINEDATA ): + #subprocess.call(["phase_E_extract_features"," --psm 6 lstm.train ","8","lstmf"],shell=True) +# subprocess.call(["make__lstmdata"],shell=True) +# subprocess.call(["tlog","\nCreated starter traineddata for language '"+str(LANG_CODE.val)+"'\n"],shell=True) +# subprocess.call(["tlog","\nRun lstmtraining to do the LSTM training for language '"+str(LANG_CODE.val)+"'\n"],shell=True) +# else: +# subprocess.call(["phase_D_generate_dawg"],shell=True) +# subprocess.call(["phase_E_extract_features","box.train","8","tr"],shell=True) +# subprocess.call(["phase_C_cluster_prototypes",str(TRAINING_DIR.val)+"/"+str(LANG_CODE.val)+".normproto"],shell=True) +# if (str(ENABLE_SHAPE_CLUSTERING.val) == "y" ): +# subprocess.call(["phase_S_cluster_shapes"],shell=True) +# subprocess.call(["phase_M_cluster_microfeatures"],shell=True) +# subprocess.call(["phase_B_generate_ambiguities"],shell=True) +# subprocess.call(["make__traineddata"],shell=True) +# subprocess.call(["tlog","\nCompleted training for language '"+str(LANG_CODE.val)+"'\n"],shell=True) diff --git a/src/training/tesstrain_utils.py b/src/training/tesstrain_utils.py new file mode 100644 index 0000000000..d7acff8bfc --- /dev/null +++ b/src/training/tesstrain_utils.py @@ -0,0 +1,617 @@ +# (C) Copyright 2014, Google Inc. 
+# (C) Copyright 2018, James R Barlow +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# For a detailed description of the phases, see +# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract +# + +import os +import sys +from datetime import date +from tempfile import TemporaryDirectory, mkdtemp +from pathlib import Path +from shutil import which +import logging +import subprocess +import argparse +from operator import itemgetter +import concurrent.futures +import shutil +import atexit + +from tqdm import tqdm + +from language_specific import VERTICAL_FONTS + +log = logging.getLogger(__name__) + +class TrainingArgs(argparse.Namespace): + def __init__(self): + self.uname = os.uname().sysname.lower() + self.lang_code="eng" + self.timestamp=str(date.today()) + + self._font_config_cache = TemporaryDirectory(prefix='font_tmp') + self.font_config_cache = self._font_config_cache.name + self.fonts_dir="/Library/Fonts/" if 'darwin' in self.uname else "/usr/share/fonts/" + + self.max_pages=0 + self.save_box_tiff=False + self.output_dir="/tmp/tesstrain/tessdata" + self.overwrite=False + self.linedata=False + self.run_shape_clustering=False + self.extract_font_properties=True + self._workspace_dir=TemporaryDirectory(prefix='tesstrain') + self.workspace_dir = self._workspace_dir.name + + +def err_exit(msg): + log.critical(msg) + sys.exit(1) + +# Helper function to run a command and append its output to a log. Aborts early +# if the program file is not found. +# Usage: run_command CMD ARG1 ARG2... +def run_command(cmd, *args, env=None): + for d in ('', 'api/', 'training/'): + testcmd = which(f'{d}{cmd}') + if testcmd: + cmd = testcmd + break + if not which(cmd): + err_exit(f"{cmd} not found") + + log.debug(f"Running {cmd}") + for arg in args: + log.debug(arg) + + proc = subprocess.run([cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) + proclog = logging.getLogger(cmd) + if proc.returncode == 0: + proclog.debug(proc.stdout.decode('utf-8', errors='replace')) + else: + proclog.error(proc.stdout.decode('utf-8', errors='replace')) + err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.") + + +# Check if all the given files exist, or exit otherwise. +# Used to check required input files and produced output files in each phase. +# Usage: check_file_readable FILE1 FILE2... +def check_file_readable(*filenames): + for filename in filenames: + try: + with Path(filename).open() as f: + pass + except FileNotFoundError: + err_exit(f"Expected file {filename} does not exist") + except PermissionError: + err_exit(f"{filename} is not readable") + except IOError as e: + err_exit(f"{filename} IO Error: {str(e)}") + return True + + + +parser = argparse.ArgumentParser( + epilog=""" + The font names specified in --fontlist need to be recognizable by Pango using + fontconfig.
An easy way to list the canonical names of all fonts available on + your system is to run text2image with --list_available_fonts and the + appropriate --fonts_dir path. + """, +) +parser.add_argument('--fontlist', dest='fonts', nargs='+', type=str, help='A list of fontnames to train on.') +parser.add_argument('--fonts_dir', help='Path to font files.') +parser.add_argument('--lang', metavar='LANG_CODE', dest='lang_code', help='ISO 639 code.') +parser.add_argument('--langdata_dir', metavar='DATADIR', help='Path to tesseract/training/langdata directory.') +parser.add_argument('--maxpages', type=int, dest='max_pages') +parser.add_argument('--output_dir', metavar='OUTPUTDIR', help='Location of output traineddata file.') +parser.add_argument('--overwrite', action='store_true', help='Safe to overwrite files in output_dir.') +parser.add_argument('--save_box_tiff', action='store_true', help='Save box/tiff pairs along with lstmf files.') +parser.add_argument('--linedata_only', dest='linedata', action='store_true', help='Only generate training data for lstmtraining.') + +inputdata_group = parser.add_argument_group('inputdata', 'OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.') +inputdata_group.add_argument('--training_text', metavar='TEXTFILE',help='Text to render and use for training.') +inputdata_group.add_argument('--wordlist', dest='wordlist_file', metavar='WORDFILE', help='Word list for the language ordered by decreasing frequency.') + +parser.add_argument('--extract_font_properties', action='store_true') +parser.add_argument('--noextract_font_properties', dest='extract_font_properties', action='store_false') + +tessdata_group = parser.add_argument_group('tessdata', 'OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.') +tessdata_group.add_argument('--tessdata_dir', metavar='TESSDATADIR', help='Path to tesseract/tessdata directory.') + +parser.add_argument('--exposures', metavar='EXPOSURES', action='append', nargs='+', help='A list of exposure levels to use (e.g. -1,0,1).') +parser.add_argument('--workspace_dir') + + +# Does simple command-line parsing and initialization. +def parse_flags(argv=None): + ctx =TrainingArgs() + log.debug(ctx) + parser.parse_args(args=argv, namespace=ctx) + log.debug(ctx) + log.info("Parsing") + + if not ctx.lang_code: + err_exit("Need to specify a language --lang") + if not ctx.langdata_dir: + err_exit("Need to specify path to language files --langdata_dir") + if not ctx.tessdata_dir: + tessdata_prefix=os.environ.get('TESSDATA_PREFIX', '') + if not tessdata_prefix: + err_exit("Need to specify a --tessdata_dir or have a " + "TESSDATA_PREFIX variable defined in your environment") + else: + ctx.tessdata_dir = tessdata_prefix + + # Location where intermediate files will be created. + ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}") + # Location of log file for the whole run. + ctx.log_file = Path(ctx.training_dir) / "tesstrain.log" + log.info(f"Log file {ctx.log_file}") + + def show_tmpdir_location(training_dir): + # On successful exit we will delete this first; on failure we want to let the user + # know where the log is + if Path(training_dir).exists(): + print(f"Temporary files retained at: {training_dir}") + atexit.register(show_tmpdir_location, ctx.training_dir) + + # Take training text and wordlist from the langdata directory if not + # specified in the command-line. 
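+ # For example, with --lang eng and --langdata_dir ../langdata these + # defaults resolve to ../langdata/eng/eng.training_text and + # ../langdata/eng/eng.wordlist, the layout used by the + # tesseract-ocr/langdata repository.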
+ if not ctx.training_text: + ctx.training_text = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text" + if not ctx.wordlist_file: + ctx.wordlist_file = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist" + + ctx.word_bigrams_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams" + ctx.numbers_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers" + ctx.punc_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc" + ctx.bigram_freqs_file=Path(ctx.training_text).with_suffix(".training_text.bigram_freqs") + ctx.unigram_freqs_file=Path(ctx.training_text).with_suffix( ".training_text.unigram_freqs") + ctx.train_ngrams_file=Path(ctx.training_text).with_suffix( ".training_text.train_ngrams") + ctx.generate_dawgs=1 + log.debug(ctx) + return ctx + + +def cleanup(ctx): + shutil.copy(ctx.log_file, ctx.output_dir) + shutil.rmtree(ctx.training_dir) + return + +# Function initializes font config with a unique font cache dir. +def initialize_fontconfig(ctx): + sample_path=Path(ctx.font_config_cache)/'sample_text.txt' + Path(sample_path).write_text('Text\n') + log.info(f"Testing font: {ctx.fonts[0]}") + run_command( + 'text2image', f'--fonts_dir={ctx.fonts_dir}', + f"--font={ctx.fonts[0]}", f"--outputbase={sample_path}", f"--text={sample_path}", + f"--fontconfig_tmpdir={ctx.font_config_cache}" + ) + + +def make_fontname(font): + return font.replace(' ', '_').replace(',', '') + +def make_outbase(ctx, fontname,exposure): + return Path(ctx.training_dir)/f"{ctx.lang_code}.{fontname}.exp{exposure}" + +# Helper function for phaseI_generate_image. Generates the image for a single +# language/font combination in a way that can be run in parallel. +def generate_font_image(ctx, font, exposure, char_spacing): + + log.info(f"Rendering using {font}") + fontname=make_fontname(font) + outbase=make_outbase(ctx, fontname, exposure) + + common_args=[ + f"--fontconfig_tmpdir={ctx.font_config_cache}", + f"--fonts_dir={ctx.fonts_dir}", + f"--strip_unrenderable_words", + f"--leading={ctx.leading}", + f"--char_spacing={char_spacing}", + f"--exposure={exposure}", + f"--outputbase={outbase}", + f"--max_pages={ctx.max_pages}", + ] + + # add --writing_mode=vertical-upright to common_args if the font is + # specified to be rendered vertically. + if font in VERTICAL_FONTS: + common_args.append('--writing_mode=vertical-upright') + + run_command( + 'text2image', + *common_args, + f"--font={font}", + f"--text={ctx.training_text}", + *ctx.text2image_extra_args + ) + + check_file_readable(str(outbase) + '.box', str(outbase) + '.tif') + + if ctx.extract_font_properties and Path(ctx.train_ngrams_file).exists(): + log.info(f"Extracting font properties of {font}") + run_command( + 'text2image', + *common_args, + f"--font={font}", + f"--ligatures=false", + f"--text={ctx.train_ngrams_file}", + f"--only_extract_font_properties", + f"--ptsize=32" + ) + check_file_readable(str(outbase) + '.fontinfo') + return f'{font}-{exposure}' + +# Phase I : Generate (I)mages from training text for each font. +def phase_I_generate_image(ctx, par_factor): + + if not par_factor or par_factor <= 0: + par_factor = 1 + + log.info("=== Phase I: Generating training images ===") + check_file_readable(ctx.training_text) + char_spacing=0.0 + + for exposure in ctx.exposures: + if ctx.extract_font_properties and Path(ctx.bigram_freqs_file).exists(): + # Parse .bigram_freqs file and compose a .train_ngrams file with text + # for tesseract to recognize during training. 
Take only the ngrams whose + # combined weight accounts for 99% of all the bigrams in the language. + lines = Path(ctx.bigram_freqs_file).read_text(encoding='utf-8').split('\n') + # Each line is "<bigram> <count>"; parse the counts once so they sort + # and sum numerically. + records = [(rec[0], int(rec[1])) for rec in (line.split(' ') for line in lines) if len(rec) == 2] + p = 0.99 + ngram_frac = p * sum(count for _, count in records) + + with Path(ctx.train_ngrams_file).open('w', encoding='utf-8') as f: + cumsum = 0 + for bigram, count in sorted(records, key=itemgetter(1), reverse=True): + if cumsum > ngram_frac: + break + f.write(bigram + ' ') + cumsum += count + + check_file_readable(ctx.train_ngrams_file) + + with tqdm(total=len(ctx.fonts)) as pbar, \ + concurrent.futures.ThreadPoolExecutor(max_workers=par_factor) as executor: + futures = [ + executor.submit(generate_font_image, ctx, font, exposure, char_spacing) + for font in ctx.fonts + ] + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as exc: + err_exit("Failed while generating images: " + str(exc)) + else: + pbar.update(1) + + # Check that each process was successful. + for font in ctx.fonts: + fontname=make_fontname(font) + outbase=make_outbase(ctx, fontname, exposure) + check_file_readable(str(outbase) + '.box', str(outbase) + '.tif') + return + + + +# Phase UP : Generate (U)nicharset and (P)roperties file. +def phase_UP_generate_unicharset(ctx): + log.info("=== Phase UP: Generating unicharset and unichar properties files ===") + + box_files=Path(ctx.training_dir).glob('*.box') + + ctx.unicharset_file=Path(ctx.training_dir) / f'{ctx.lang_code}.unicharset' + + run_command( + 'unicharset_extractor', + '--output_unicharset', f"{ctx.unicharset_file}", + '--norm_mode', f"{ctx.norm_mode}", + *box_files + ) + check_file_readable(ctx.unicharset_file) + + ctx.xheights_file=Path(ctx.training_dir) / f'{ctx.lang_code}.xheights' + run_command( + 'set_unicharset_properties', + '-U', f'{ctx.unicharset_file}', + '-O', f'{ctx.unicharset_file}', + '-X', f'{ctx.xheights_file}', + f'--script_dir={ctx.langdata_dir}' + ) + check_file_readable(ctx.xheights_file) + + +# # Phase D : Generate (D)awg files from unicharset file and wordlist files +# phase_D_generate_dawg() { +# tlog "\n=== Phase D: Generating Dawg files ===" + +# # Skip if requested +# if [[ ${GENERATE_DAWGS} -eq 0 ]]; then +# tlog "Skipping ${phase_name}" +# return +# fi + +# # Output files +# WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg +# FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg +# PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg +# NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg +# BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg + +# # Word DAWG +# local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq +# if [[ -s ${WORDLIST_FILE} ]]; then +# tlog "Generating word Dawg" +# check_file_readable ${unicharset_file} +# run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ +# ${UNICHARSET_FILE} +# check_file_readable ${WORD_DAWG} + +# FREQ_DAWG_SIZE=100 +# head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file} +# fi + +# # Freq-word DAWG +# if [[ -s ${freq_wordlist_file} ]]; then +# check_file_readable ${UNICHARSET_FILE} +# tlog "Generating frequent-word Dawg" +# run_command wordlist2dawg -r 1 ${freq_wordlist_file} \ +# ${FREQ_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${FREQ_DAWG} +# fi + +# # Punctuation DAWG +# # -r arguments to wordlist2dawg denote RTL reverse policy +# # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
+# # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, +# # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, +# # 2/RRP_FORCE_REVERSE for the punctuation DAWG. +# local punc_reverse_policy=0; +# if [[ "${LANG_IS_RTL}" == "1" ]]; then +# punc_reverse_policy=2 +# fi +# if [[ ! -s ${PUNC_FILE} ]]; then +# PUNC_FILE="{ctx.langdata_dir}/common.punc" +# fi +# check_file_readable ${PUNC_FILE} +# run_command wordlist2dawg -r ${punc_reverse_policy} \ +# ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${PUNC_DAWG} + +# # Numbers DAWG +# if [[ -s ${NUMBERS_FILE} ]]; then +# run_command wordlist2dawg -r 0 \ +# ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${NUMBER_DAWG} +# fi + +# # Bigram dawg +# if [[ -s ${WORD_BIGRAMS_FILE} ]]; then +# run_command wordlist2dawg -r 1 \ +# ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${BIGRAM_DAWG} +# fi +# } + +# Phase E : (E)xtract .tr feature files from .tif/.box files +def phase_E_extract_features(ctx, box_config, ext): + log.info(f"=== Phase E: Generating {ext} files ===") + + img_files=list(Path(ctx.training_dir).glob('*.exp*.tif')) + log.debug(img_files) + + # Use any available language-specific configs. + config="" + testconfig = Path(ctx.langdata_dir) / ctx.lang_code / f'{ctx.lang_code}.config' + if testconfig.exists(): + config = testconfig + log.info(f"Using {ctx.lang_code}.config") + + tessdata_environ = os.environ.copy() + tessdata_environ['TESSDATA_PREFIX'] = str(ctx.tessdata_dir) + + log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}") + + with tqdm(total=len(img_files)) as pbar, \ + concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + futures = [] + for img_file in img_files: + future = executor.submit( + run_command, + 'tesseract', + img_file, + Path(img_file).with_suffix(''), + *box_config, + config, + env=tessdata_environ + ) + futures.append(future) + + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as exc: + err_exit("Failed while extracting features: " + str(exc)) + else: + pbar.update(1) + # Check that all the output files were produced. + for img_file in img_files: + check_file_readable(Path(img_file.with_suffix('.' + ext))) + + return + +# # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining) +# # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto +# phase_C_cluster_prototypes() { +# tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" +# local out_normproto=$1 + +# run_command cntraining -D "${TRAINING_DIR}/" \ +# $(ls ${TRAINING_DIR}/*.tr) + +# check_file_readable ${TRAINING_DIR}/normproto +# mv ${TRAINING_DIR}/normproto ${out_normproto} +# } + +# # Phase S : (S)hape clustering +# phase_S_cluster_shapes() { +# if ((! 
RUN_SHAPE_CLUSTERING)); then +# tlog "\n=== Shape Clustering disabled ===" +# return +# fi +# check_file_readable {ctx.langdata_dir}/font_properties +# local font_props="-F {ctx.langdata_dir}/font_properties" +# if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\ +# [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then +# font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" +# fi + +# run_command shapeclustering \ +# -D "${TRAINING_DIR}/" \ +# -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ +# -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ +# ${font_props} \ +# $(ls ${TRAINING_DIR}/*.tr) +# check_file_readable ${TRAINING_DIR}/shapetable \ +# ${TRAINING_DIR}/${LANG_CODE}.mfunicharset +# } + +# # Phase M : Clustering microfeatures (mfTraining) +# phase_M_cluster_microfeatures() { +# tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ===" + +# check_file_readable {ctx.langdata_dir}/font_properties +# font_props="-F {ctx.langdata_dir}/font_properties" +# if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \ +# [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then +# font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" +# fi + +# run_command mftraining \ +# -D "${TRAINING_DIR}/" \ +# -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ +# -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ +# ${font_props} \ +# $(ls ${TRAINING_DIR}/*.tr) +# check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \ +# ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset +# mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp +# mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable +# mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable +# mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset +# } + +# phase_B_generate_ambiguities() { +# tlog "\n=== Phase B : ambiguities training ===" + +# # Check for manually created ambiguities data. +# if [[ -r {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then +# tlog "Found file {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs" +# cp {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs \ +# ${TRAINING_DIR}/${LANG_CODE}.unicharambigs +# # Make it writable, as it may be read-only in the client. +# chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs +# return +# else +# tlog "No unicharambigs file found!" +# fi + +# # TODO: Add support for generating ambiguities automatically. +# } + +def make_lstmdata(ctx): + log.info("=== Constructing LSTM training data ===") + lang_prefix=f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}" + path_output = Path(ctx.output_dir) + if not path_output.is_dir(): + log.info(f"Creating new directory {ctx.output_dir}") + path_output.mkdir(exist_ok=True, parents=True) + + args = [] + if ctx.lang_is_rtl: + args.append("--lang_is_rtl") + if ctx.norm_mode >= 2: + args.append("--pass_through_recoder") + + # Build the starter traineddata from the inputs. 
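+ # combine_lang_model reads the unicharset together with the wordlist, + # numbers and punc files, and writes a starter traineddata (unicharset, + # recoder and dawgs) under output_dir, which lstmtraining then uses as + # its starting point.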
+ run_command( + 'combine_lang_model', + '--input_unicharset', f"{ctx.training_dir}/{ctx.lang_code}.unicharset", + '--script_dir', f"{ctx.langdata_dir}", + '--words', f"{lang_prefix}.wordlist", + '--numbers', f"{lang_prefix}.numbers", + '--puncs', f"{lang_prefix}.punc", + '--output_dir', f"{ctx.output_dir}", + '--lang', f"{ctx.lang_code}", + *args + ) + + def get_file_list(): + training_path = Path(ctx.training_dir) + if ctx.save_box_tiff: + log.info("=== Saving box/tiff pairs for training data ===") + yield from training_path.glob(f'{ctx.lang_code}*.box') + yield from training_path.glob(f'{ctx.lang_code}*.tif') + log.info("=== Moving lstmf files for training data ===") + yield from training_path.glob(f'{ctx.lang_code}.*.lstmf') + + for f in get_file_list(): + log.debug(f"Moving {f} to {path_output / f.name}") + shutil.move(str(f), path_output / f.name) + + lstm_list=f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt" + dir_listing = (str(p) for p in path_output.glob(f'{ctx.lang_code}.*.lstmf')) + Path(lstm_list).write_text('\n'.join(dir_listing)) + + +# make__traineddata() { +# tlog "\n=== Making final traineddata file ===" +# local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE} + +# # Combine available files for this language from the langdata dir. +# if [[ -r ${lang_prefix}.config ]]; then +# tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}" +# cp ${lang_prefix}.config ${TRAINING_DIR} +# chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config +# fi +# if [[ -r ${lang_prefix}.params-model ]]; then +# tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}" +# cp ${lang_prefix}.params-model ${TRAINING_DIR} +# chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model +# fi + +# # Compose the traineddata file. +# run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}. + +# # Copy it to the output dir, overwriting only if allowed by the cmdline flag. +# if [[ ! -d ${OUTPUT_DIR} ]]; then +# tlog "Creating new directory ${OUTPUT_DIR}" +# mkdir -p ${OUTPUT_DIR} +# fi +# local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata; +# if [[ -f ${destfile} ]] && ((! OVERWRITE)); then +# err_exit "File ${destfile} exists and no --overwrite specified"; +# fi +# tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}" +# cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile} +# }
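+ +# A short usage sketch for the new scripts (hypothetical paths; the flags +# are the ones defined by the argument parser above): +# +# python3 src/training/tesstrain.py --lang eng --linedata_only \ +# --langdata_dir ../langdata --tessdata_dir ./tessdata \ +# --output_dir ~/tesstutorial/engtrain +# +# This renders training images for each font, builds the unicharset, +# extracts .lstmf features with "--psm 6 lstm.train", and assembles the +# starter traineddata that lstmtraining takes as input.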