diff --git a/.gitignore b/.gitignore index 4622249116..b7cd83cc93 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ build/* *.a *.class *.jar +__pycache__ # tessdata *.traineddata diff --git a/src/training/language_specific.py b/src/training/language_specific.py new file mode 100644 index 0000000000..76803717d2 --- /dev/null +++ b/src/training/language_specific.py @@ -0,0 +1,1294 @@ +#!/usr/bin/env python3 +# (C) Copyright 2014, Google Inc. +# (C) Copyright 2018, James R Barlow +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Set some language-specific variables. Works in conjunction with +# tesstrain.py +# + +#============================================================================= +# Language specific info +#============================================================================= + +import os +import logging +log = logging.getLogger(__name__) + +# Space-separated string of all valid language codes. +VALID_LANGUAGE_CODES=("afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat " + "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo " + "ell eng enm epo est eus fas fil fin fra frk frm gle glg " + "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old " + "jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat " + "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori " + "pan pol por pus ron rus san sin slk slv snd spa spa_old " + "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur " + "uig ukr urd uzb uzb_cyrl vie yid gle_uncial ") + +# Codes for which we have webtext but no fonts: +UNUSABLE_LANGUAGE_CODES="" + +FRAKTUR_FONTS=[ + "CaslonishFraxx Medium", + "Cloister Black, Light", + "Proclamate Light", + "UnifrakturMaguntia", + "Walbaum-Fraktur", +] + +# List of fonts to train on +LATIN_FONTS=[ + "Arial Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial", + "Courier New Bold", + "Courier New Bold Italic", + "Courier New Italic", + "Courier New", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Georgia Bold", + "Georgia Italic", + "Georgia", + "Georgia Bold Italic", + "Trebuchet MS Bold", + "Trebuchet MS Bold Italic", + "Trebuchet MS Italic", + "Trebuchet MS", + "Verdana Bold", + "Verdana Italic", + "Verdana", + "Verdana Bold Italic", + "URW Bookman L Bold", + "URW Bookman L Italic", + "URW Bookman L Bold Italic", + "Century Schoolbook L Bold", + "Century Schoolbook L Italic", + "Century Schoolbook L Bold Italic", + "Century Schoolbook L Medium", + "DejaVu Sans Ultra-Light", +] + +# List of fonts for printed/neo-Latin ('lat' language code, different from Latin script) +NEOLATIN_FONTS=[ + "GFS Bodoni", + "GFS Bodoni Bold", + "GFS Bodoni Italic", + "GFS Bodoni Bold Italic", + "GFS Didot", + "GFS Didot Bold", + "GFS Didot Italic", + "GFS Didot Bold Italic", + "Cardo", + "Cardo Bold", + "Cardo Italic", + "Wyld", + "Wyld Italic", + "EB Garamond", + "EB Garamond Italic", + "Junicode", + "Junicode Bold", + "Junicode Italic", + "Junicode Bold Italic", + "IM FELL DW Pica PRO", + "IM FELL English PRO", +
"IM FELL Double Pica PRO", + "IM FELL French Canon PRO", + "IM FELL Great Primer PRO", + "IM FELL DW Pica PRO Italic", + "IM FELL English PRO Italic", + "IM FELL Double Pica PRO Italic", + "IM FELL French Canon PRO Italic", + "IM FELL Great Primer PRO Italic", +] + +IRISH_UNCIAL_FONTS=[ + "Bunchlo Arsa Dubh GC", + "Bunchlo Arsa GC", + "Bunchlo Arsa GC Bold", + "Bunchlo Dubh GC", + "Bunchlo GC", + "Bunchlo GC Bold", + "Bunchlo Nua GC Bold", + "Bunchló na Nod GC", + "Gadelica", + "Glanchlo Dubh GC", + "Glanchlo GC", + "Glanchlo GC Bold", + "Seanchló Dubh GC", + "Seanchló GC", + "Seanchló GC Bold", + "Seanchló na Nod GC", + "Seanchló Ársa Dubh GC", + "Seanchló Ársa GC", + "Seanchló Ársa GC Bold", + "Tromchlo Beag GC", + "Tromchlo Mor GC", + "Urchlo GC", + "Urchlo GC Bold", +] + +EARLY_LATIN_FONTS=[ + *FRAKTUR_FONTS, + *LATIN_FONTS, + # The Wyld font family renders early modern ligatures encoded in the private + # unicode area. + "Wyld", + "Wyld Italic", + # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English. + "GentiumAlt", +] + +VIETNAMESE_FONTS=[ + "Arial Unicode MS Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial Unicode MS", + "FreeMono Bold", + "Courier New Bold Italic", + "FreeMono Italic", + "FreeMono", + "GentiumAlt Italic", + "GentiumAlt", + "Palatino Linotype Bold", + "Palatino Linotype Bold Italic", + "Palatino Linotype Italic", + "Palatino Linotype", + "Really No 2 LT W2G Light", + "Really No 2 LT W2G Light Italic", + "Really No 2 LT W2G Medium", + "Really No 2 LT W2G Medium Italic", + "Really No 2 LT W2G Semi-Bold", + "Really No 2 LT W2G Semi-Bold Italic", + "Really No 2 LT W2G Ultra-Bold", + "Really No 2 LT W2G Ultra-Bold Italic", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Verdana Bold", + "Verdana Italic", + "Verdana", + "Verdana Bold Italic", + "VL Gothic", + "VL PGothic", +] + +DEVANAGARI_FONTS=[ + "FreeSans", + "Chandas", + "Kalimati", + "Uttara", + "Lucida Sans", + "gargi Medium", + "Lohit Devanagari", + "Arial Unicode MS Bold", + "Ascender Uni", + "Noto Sans Devanagari Bold", + "Noto Sans Devanagari", + "Samyak Devanagari Medium", + "Sarai", + "Saral LT Bold", + "Saral LT Light", + "Nakula", + "Sahadeva", + "Samanata", + "Santipur OT Medium", +] + +KANNADA_FONTS=[ + "Kedage Bold", + "Kedage Italic", + "Kedage", + "Kedage Bold Italic", + "Mallige Bold", + "Mallige Italic", + "Mallige", + "Mallige Bold Italic", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "cheluvi Medium", + "Noto Sans Kannada Bold", + "Noto Sans Kannada", + "Lohit Kannada", + "Tunga", + "Tunga Bold", +] + +TELUGU_FONTS=[ + "Pothana2000", + "Vemana2000", + "Lohit Telugu", + "Arial Unicode MS Bold", + "Ascender Uni", + "Dhurjati", + "Gautami Bold", + "Gidugu", + "Gurajada", + "Lakki Reddy", + "Mallanna", + "Mandali", + "NATS", + "NTR", + "Noto Sans Telugu Bold", + "Noto Sans Telugu", + "Peddana", + "Ponnala", + "Ramabhadra", + "Ravi Prakash", + "Sree Krushnadevaraya", + "Suranna", + "Suravaram", + "Tenali Ramakrishna", + "Gautami", +] + +TAMIL_FONTS=[ + "TAMu_Kadambri", + "TAMu_Kalyani", + "TAMu_Maduram", + "TSCu_Paranar", + "TSCu_Times", + "TSCu_Paranar Bold", + "FreeSans", + "FreeSerif", + "Lohit Tamil", + "Arial Unicode MS Bold", + "Ascender Uni", + "Droid Sans Tamil Bold", + "Droid Sans Tamil", + "Karla Tamil Inclined Bold Italic", + "Karla Tamil Inclined Italic", + "Karla Tamil Upright Bold", + "Karla Tamil Upright", + "Noto Sans Tamil Bold", + "Noto Sans Tamil", + "Noto Sans Tamil 
UI Bold", + "Noto Sans Tamil UI", + "TSCu_Comic Normal", + "Lohit Tamil Classical", +] + +THAI_FONTS=[ + "FreeSerif", + "FreeSerif Italic", + "Garuda", + "Norasi", + "Lucida Sans Typewriter", + "Lucida Sans", + "Garuda Oblique", + "Norasi Oblique", + "Norasi Italic", + "Garuda Bold", + "Norasi Bold", + "Lucida Sans Typewriter Bold", + "Lucida Sans Semi-Bold", + "Garuda Bold Oblique", + "Norasi Bold Italic", + "Norasi Bold Oblique", + "AnuParp LT Thai", + "Arial Unicode MS Bold", + "Arial Unicode MS", + "Ascender Uni", + "Loma", + "Noto Serif Thai Bold", + "Noto Serif Thai", + "Purisa Light", + "Sirichana LT Bold", + "Sirichana LT", + "Sukothai LT Bold", + "Sukothai LT", + "UtSaHaGumm LT Thai", + "Tahoma", +] + +KOREAN_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Baekmuk Batang Patched", + "Baekmuk Batang", + "Baekmuk Dotum", + "Baekmuk Gulim", + "Baekmuk Headline", +] + +CHI_SIM_FONTS=[ + "AR PL UKai CN", + "AR PL UMing Patched Light", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "WenQuanYi Zen Hei Medium", +] + +CHI_TRA_FONTS=[ + "AR PL UKai TW", + "AR PL UMing TW MBE Light", + "AR PL UKai Patched", + "AR PL UMing Patched Light", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "WenQuanYi Zen Hei Medium", +] + +JPN_FONTS=[ + "TakaoExGothic", + "TakaoExMincho", + "TakaoGothic", + "TakaoMincho", + "TakaoPGothic", + "TakaoPMincho", + "VL Gothic", + "VL PGothic", + "Noto Sans Japanese Bold", + "Noto Sans Japanese Light", +] + +RUSSIAN_FONTS=[ + "Arial Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial", + "Courier New Bold", + "Courier New Bold Italic", + "Courier New Italic", + "Courier New", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Georgia Bold", + "Georgia Italic", + "Georgia", + "Georgia Bold Italic", + "Trebuchet MS Bold", + "Trebuchet MS Bold Italic", + "Trebuchet MS Italic", + "Trebuchet MS", + "Verdana Bold", + "Verdana Italic", + "Verdana", + "Verdana Bold Italic", + "DejaVu Serif", + "DejaVu Serif Oblique", + "DejaVu Serif Bold", + "DejaVu Serif Bold Oblique", + "Lucida Bright", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "DejaVu Sans Ultra-Light", +] + +GREEK_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "DejaVu Sans Mono", + "DejaVu Sans Mono Oblique", + "DejaVu Sans Mono Bold", + "DejaVu Sans Mono Bold Oblique", + "DejaVu Serif", + "DejaVu Serif Semi-Condensed", + "DejaVu Serif Oblique", + "DejaVu Serif Bold", + "DejaVu Serif Bold Oblique", + "DejaVu Serif Bold Semi-Condensed", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "FreeSerif Italic", + "FreeSerif", + "GentiumAlt", + "GentiumAlt Italic", + "Linux Biolinum O Bold", + "Linux Biolinum O", + "Linux Libertine O Bold", + "Linux Libertine O", + "Linux Libertine O Bold Italic", + "Linux Libertine O Italic", + "Palatino Linotype Bold", + "Palatino Linotype Bold Italic", + "Palatino Linotype Italic", + "Palatino Linotype", + "UmePlus P Gothic", + "VL PGothic", +] + +ANCIENT_GREEK_FONTS=[ + "GFS Artemisia", + "GFS Artemisia Bold", + "GFS Artemisia Bold Italic", + "GFS Artemisia Italic", + "GFS Bodoni", + "GFS Bodoni Bold", + "GFS Bodoni Bold Italic", + "GFS Bodoni Italic", + "GFS Didot", + "GFS Didot Bold", + "GFS Didot Bold Italic", + "GFS Didot Italic", + "GFS DidotClassic", + "GFS Neohellenic", + "GFS Neohellenic Bold", + "GFS Neohellenic Bold Italic", + "GFS Neohellenic Italic", + "GFS Philostratos", + "GFS Porson", + "GFS Pyrsos", + "GFS Solomos", +] + +ARABIC_FONTS=[ + "Arabic Transparent Bold", + "Arabic 
Transparent", + "Arab", + "Arial Unicode MS Bold", + "Arial Unicode MS", + "ASVCodar LT Bold", + "ASVCodar LT Light", + "Badiya LT Bold", + "Badiya LT", + "Badr LT Bold", + "Badr LT", + "Dimnah", + "Frutiger LT Arabic Bold", + "Frutiger LT Arabic", + "Furat", + "Hassan LT Bold", + "Hassan LT Light", + "Jalal LT Bold", + "Jalal LT Light", + "Midan Bold", + "Midan", + "Mitra LT Bold", + "Mitra LT Light", + "Palatino LT Arabic", + "Palatino Sans Arabic Bold", + "Palatino Sans Arabic", + "Simplified Arabic Bold", + "Simplified Arabic", + "Times New Roman, Bold", + "Times New Roman,", + "Traditional Arabic Bold", + "Traditional Arabic", +] + +HEBREW_FONTS=[ + "Arial Bold", + "Arial Bold Italic", + "Arial Italic", + "Arial", + "Courier New Bold", + "Courier New Bold Italic", + "Courier New Italic", + "Courier New", + "Ergo Hebrew Semi-Bold", + "Ergo Hebrew Semi-Bold Italic", + "Ergo Hebrew", + "Ergo Hebrew Italic", + "Really No 2 LT W2G Light", + "Really No 2 LT W2G Light Italic", + "Really No 2 LT W2G Medium", + "Really No 2 LT W2G Medium Italic", + "Really No 2 LT W2G Semi-Bold", + "Really No 2 LT W2G Semi-Bold Italic", + "Really No 2 LT W2G Ultra-Bold", + "Really No 2 LT W2G Ultra-Bold Italic", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Times New Roman,", + "Lucida Sans", + "Tahoma", +] + +BENGALI_FONTS=[ + "Bangla Medium", + "Lohit Bengali", + "Mukti Narrow", + "Mukti Narrow Bold", + "Jamrul Medium Semi-Expanded", + "Likhan Medium", + "Arial Unicode MS Bold", + "Ascender Uni", + "FreeSans", + "FreeSans Oblique", + "FreeSerif", + "FreeSerif Italic", + "Noto Sans Bengali Bold", + "Noto Sans Bengali", + "Ani", + "Lohit Assamese", + "Lohit Bengali", + "Mitra Mono", +] + +KYRGYZ_FONTS=[ + "Arial", + "Arial Bold", + "Arial Italic", + "Arial Bold Italic", + "Courier New", + "Courier New Bold", + "Courier New Italic", + "Courier New Bold Italic", + "Times New Roman,", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "DejaVu Serif", + "DejaVu Serif Oblique", + "DejaVu Serif Bold", + "DejaVu Serif Bold Oblique", + "Lucida Bright", + "FreeSerif Bold", + "FreeSerif Bold Italic", +] + +PERSIAN_FONTS=[ + "Amiri Bold Italic", + "Amiri Bold", + "Amiri Italic", + "Amiri", + "Andale Sans Arabic Farsi", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Lateef", + "Lucida Bright", + "Lucida Sans Oblique", + "Lucida Sans Semi-Bold", + "Lucida Sans", + "Lucida Sans Typewriter Bold", + "Lucida Sans Typewriter Oblique", + "Lucida Sans Typewriter", + "Scheherazade", + "Tahoma", + "Times New Roman,", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Yakout Linotype Bold", + "Yakout Linotype", +] + +AMHARIC_FONTS=[ + "Abyssinica SIL" + "Droid Sans Ethiopic Bold", + "Droid Sans Ethiopic", + "FreeSerif", + "Noto Sans Ethiopic Bold", + "Noto Sans Ethiopic", +] + +ARMENIAN_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "FreeMono", + "FreeMono Italic", + "FreeSans", + "FreeSans Bold", + "FreeSans Oblique", +] + +BURMESE_FONTS=[ + "Myanmar Sans Pro", + "Noto Sans Myanmar Bold", + "Noto Sans Myanmar", + "Padauk Bold", + "Padauk", + "TharLon", +] + +JAVANESE_FONTS=[ + "Prada", +] + +NORTH_AMERICAN_ABORIGINAL_FONTS=[ + "Aboriginal Sans", + "Aboriginal Sans Bold Italic", + "Aboriginal Sans Italic", + "Aboriginal Sans Bold", + "Aboriginal Serif Bold", + "Aboriginal Serif Bold Italic", + "Aboriginal Serif Italic", + "Aboriginal Serif", +] + +GEORGIAN_FONTS=[ + 
"Arial Unicode MS Bold", + "Arial Unicode MS", + "BPG Algeti GPL\&GNU", + "BPG Chveulebrivi GPL\&GNU", + "BPG Courier GPL\&GNU", + "BPG Courier S GPL\&GNU", + "BPG DejaVu Sans 2011 GNU-GPL", + "BPG Elite GPL\&GNU", + "BPG Excelsior GPL\&GNU", + "BPG Glaho GPL\&GNU", + "BPG Gorda GPL\&GNU", + "BPG Ingiri GPL\&GNU", + "BPG Mrgvlovani Caps GNU\&GPL", + "BPG Mrgvlovani GPL\&GNU", + "BPG Nateli Caps GPL\&GNU Light", + "BPG Nateli Condenced GPL\&GNU Light", + "BPG Nateli GPL\&GNU Light", + "BPG Nino Medium Cond GPL\&GNU", + "BPG Nino Medium GPL\&GNU Medium", + "BPG Sans GPL\&GNU", + "BPG Sans Medium GPL\&GNU", + "BPG Sans Modern GPL\&GNU", + "BPG Sans Regular GPL\&GNU", + "BPG Serif GPL\&GNU", + "BPG Serif Modern GPL\&GNU", + "FreeMono", + "FreeMono Bold Italic", + "FreeSans", + "FreeSerif", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "FreeSerif Italic", +] + +OLD_GEORGIAN_FONTS=[ + "Arial Unicode MS Bold", + "Arial Unicode MS", + "BPG Algeti GPL\&GNU", + "BPG Courier S GPL\&GNU", + "BPG DejaVu Sans 2011 GNU-GPL", + "BPG Elite GPL\&GNU", + "BPG Excelsior GPL\&GNU", + "BPG Glaho GPL\&GNU", + "BPG Ingiri GPL\&GNU", + "BPG Mrgvlovani Caps GNU\&GPL", + "BPG Mrgvlovani GPL\&GNU", + "BPG Nateli Caps GPL\&GNU Light", + "BPG Nateli Condenced GPL\&GNU Light", + "BPG Nateli GPL\&GNU Light", + "BPG Nino Medium Cond GPL\&GNU", + "BPG Nino Medium GPL\&GNU Medium", + "BPG Sans GPL\&GNU", + "BPG Sans Medium GPL\&GNU", + "BPG Sans Modern GPL\&GNU", + "BPG Sans Regular GPL\&GNU", + "BPG Serif GPL\&GNU", + "BPG Serif Modern GPL\&GNU", + "FreeSans", + "FreeSerif", + "FreeSerif Bold", + "FreeSerif Bold Italic", + "FreeSerif Italic", +] + +KHMER_FONTS=[ + "Khmer OS", + "Khmer OS System", + "Khmer OS Battambang", + "Khmer OS Bokor", + "Khmer OS Content", + "Khmer OS Fasthand", + "Khmer OS Freehand", + "Khmer OS Metal Chrieng", + "Khmer OS Muol Light", + "Khmer OS Muol Pali", + "Khmer OS Muol", + "Khmer OS Siemreap", + "Noto Sans Bold", + "Noto Sans", + "Noto Serif Khmer Bold", + "Noto Serif Khmer Light", +] + +KURDISH_FONTS=[ + "Amiri Bold Italic", + "Amiri Bold", + "Amiri Italic", + "Amiri", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Lateef", + "Lucida Bright", + "Lucida Sans Oblique", + "Lucida Sans Semi-Bold", + "Lucida Sans", + "Lucida Sans Typewriter Bold", + "Lucida Sans Typewriter Oblique", + "Lucida Sans Typewriter", + "Scheherazade", + "Tahoma", + "Times New Roman,", + "Times New Roman, Bold", + "Times New Roman, Bold Italic", + "Times New Roman, Italic", + "Unikurd Web", + "Yakout Linotype Bold", + "Yakout Linotype", +] + +LAOTHIAN_FONTS=[ + "Phetsarath OT", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "Dhyana Bold", + "Dhyana", + "Lao Muang Don", + "Lao Muang Khong", + "Lao Sans Pro", + "Noto Sans Lao Bold", + "Noto Sans Lao", + "Noto Sans Lao UI Bold", + "Noto Sans Lao UI", + "Noto Serif Lao Bold", + "Noto Serif Lao", + "Phetsarath Bold", + "Phetsarath", + "Souliyo Unicode", +] + +GUJARATI_FONTS=[ + "Lohit Gujarati", + "Rekha Medium", + "Samyak Gujarati Medium", + "aakar Medium", + "padmaa Bold", + "padmaa Medium", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "FreeSans", + "Noto Sans Gujarati Bold", + "Noto Sans Gujarati", + "Shruti", + "Shruti Bold", +] + +MALAYALAM_FONTS=[ + "AnjaliOldLipi", + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "Dyuthi", + "FreeSerif", + "Kalyani", + "Kartika", + "Kartika Bold", + "Lohit Malayalam", + "Meera", + "Noto Sans Malayalam Bold", + "Noto Sans Malayalam", + "Rachana", + "Rachana_w01", 
+ "RaghuMalayalam", + "suruma", +] + +ORIYA_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "ori1Uni Medium", + "Samyak Oriya Medium", + "Lohit Oriya", +] + +PUNJABI_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "Saab", + "Lohit Punjabi", + "Noto Sans Gurmukhi", + "Noto Sans Gurmukhi Bold", + "FreeSans", + "FreeSans Bold", + "FreeSerif", +] + +SINHALA_FONTS=[ + "Noto Sans Sinhala Bold", + "Noto Sans Sinhala", + "OCRUnicode", + "Yagpo", + "LKLUG", + "FreeSerif", +] + +SYRIAC_FONTS=[ + "East Syriac Adiabene", + "East Syriac Ctesiphon", + "Estrangelo Antioch", + "Estrangelo Edessa", + "Estrangelo Midyat", + "Estrangelo Nisibin", + "Estrangelo Quenneshrin", + "Estrangelo Talada", + "Estrangelo TurAbdin", + "Serto Batnan Bold", + "Serto Batnan", + "Serto Jerusalem Bold", + "Serto Jerusalem Italic", + "Serto Jerusalem", + "Serto Kharput", + "Serto Malankara", + "Serto Mardin Bold", + "Serto Mardin", + "Serto Urhoy Bold", + "Serto Urhoy", + "FreeSans", +] + +THAANA_FONTS=[ + "FreeSerif", +] + +TIBETAN_FONTS=[ + "Arial Unicode MS", + "Arial Unicode MS Bold", + "Ascender Uni", + "DDC Uchen", + "Jomolhari", + "Kailasa", + "Kokonor", + "Tibetan Machine Uni", + "TibetanTsugRing", + "Yagpo", +] + +# The following fonts will be rendered vertically in phase I. +VERTICAL_FONTS=[ + "TakaoExGothic", + "TakaoExMincho", + "AR PL UKai Patched", + "AR PL UMing Patched Light", + "Baekmuk Batang Patched", +] + +FLAGS_webtext_prefix=os.environ.get('FLAGS_webtext_prefix', '') + +# Set language-specific values for several global variables, including +# ${TEXT_CORPUS} +# holds the text corpus file for the language, used in phase F +# ${FONTS[@]} +# holds a sequence of applicable fonts for the language, used in +# phase F & I. only set if not already set, i.e. from command line +# ${TRAINING_DATA_ARGUMENTS} +# non-default arguments to the training_data program used in phase T +# ${FILTER_ARGUMENTS}[ -] +# character-code-specific filtering to distinguish between scripts +# (eg. CJK) used by filter_borbidden_characters in phase F +# ${WORDLIST2DAWG_ARGUMENTS} +# specify fixed length dawg generation for non-space-delimited lang +# TODO(dsl): We can refactor these into functions that assign FONTS, +# TEXT_CORPUS, etc. separately. +def set_lang_specific_parameters(ctx, lang): + # The default text location is now given directly from the language code. + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/{lang}.corpus.txt" + FILTER_ARGUMENTS=[] + WORDLIST2DAWG_ARGUMENTS="" + # These dawg factors represent the fraction of the corpus not covered by the + # dawg, and seem like reasonable defaults, but the optimal value is likely + # to be highly corpus-dependent, as well as somewhat language-dependent. + # Number dawg factor is the fraction of all numeric strings that are not + # covered, which is why it is higher relative to the others. + PUNC_DAWG_FACTOR=None + NUMBER_DAWG_FACTOR=0.125 + WORD_DAWG_FACTOR=0.05 + BIGRAM_DAWG_FACTOR=0.015 + TRAINING_DATA_ARGUMENTS=[] + FRAGMENTS_DISABLED="y" + RUN_SHAPE_CLUSTERING=False + AMBIGS_FILTER_DENOMINATOR="100000" + LEADING=32 + MEAN_COUNT=40 # Default for latin script. + # Language to mix with the language for maximum accuracy. Defaults to eng. + # If no language is good, set to the base language. + MIX_LANG="eng" + FONTS=ctx.fonts + TEXT2IMAGE_EXTRA_ARGS=[] + EXPOSURES=[] + + + # Latin languages. 
+ if lang == 'enm': + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'frm': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/fra.corpus.txt" + # Make long-s substitutions for Middle French text + FILTER_ARGUMENTS+=["--make_early_language_variant=fra"] + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported. + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'frk': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/deu.corpus.txt" + if not FONTS: FONTS = FRAKTUR_FONTS + elif lang == 'ita_old': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/ita.corpus.txt" + # Make long-s substitutions for Early Italian text + FILTER_ARGUMENTS+=["--make_early_language_variant=ita"] + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported. + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'lat': + if not EXPOSURES: EXPOSURES="-3 -2 -1 0 1 2 3".split() + if not FONTS: FONTS = NEOLATIN_FONTS + elif lang == 'spa_old': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/spa.corpus.txt" + # Make long-s substitutions for Early Spanish text + FILTER_ARGUMENTS+=["--make_early_language_variant=spa"] + TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"] # Add ligatures when supported. + if not FONTS: FONTS = EARLY_LATIN_FONTS + elif lang == 'srp_latn': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/srp.corpus.txt" + elif lang == 'vie': + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = VIETNAMESE_FONTS + # Highly inflective languages get a bigger dawg size. + # TODO(rays) Add more here! + elif lang == 'hun': + WORD_DAWG_SIZE=1000000 + elif lang == 'pol': + WORD_DAWG_SIZE=1000000 + + # Latin with default treatment. + elif lang == 'afr': + pass + elif lang == 'aze': + pass + elif lang == 'bos': + pass + elif lang == 'cat': + pass + elif lang == 'ceb': + pass + elif lang == 'ces': + PUNC_DAWG_FACTOR=0.004 + elif lang == 'cym': + pass + elif lang == 'dan': + pass + elif lang == 'deu': + WORD_DAWG_FACTOR=0.125 + elif lang == 'eng': + WORD_DAWG_FACTOR=0.03 + elif lang == 'epo': + pass + elif lang == 'est': + pass + elif lang == 'eus': + pass + elif lang == 'fil': + pass + elif lang == 'fin': + pass + elif lang == 'fra': + WORD_DAWG_FACTOR=0.08 + elif lang == 'gle': + pass + elif lang == 'gle_uncial': + if not FONTS: FONTS = IRISH_UNCIAL_FONTS + elif lang == 'glg': + pass + elif lang == 'hat': + pass + elif lang == 'hrv': + pass + elif lang == 'iast': + pass + elif lang == 'ind': + pass + elif lang == 'isl': + pass + elif lang == 'ita': + pass + elif lang == 'jav': + pass + elif lang == 'lav': + pass + elif lang == 'lit': + pass + elif lang == 'mlt': + pass + elif lang == 'msa': + pass + elif lang == 'nld': + WORD_DAWG_FACTOR=0.02 + elif lang == 'nor': + pass + elif lang == 'por': + pass + elif lang == 'ron': + pass + elif lang == 'slk': + pass + elif lang == 'slv': + pass + elif lang == 'spa': + pass + elif lang == 'sqi': + pass + elif lang == 'swa': + pass + elif lang == 'swe': + pass + elif lang == 'tgl': + pass + elif lang == 'tur': + pass + elif lang == 'uzb': + pass + elif lang == 'zlm': + pass + + # Special code for performing language-id that is trained on + # EFIGS+Latin+Vietnamese text with regular + fraktur fonts. + elif lang == 'lat_lid': + TEXT_CORPUS=f'{FLAGS_webtext_prefix}/lat_lid.corpus.txt' + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + GENERATE_WORD_BIGRAMS=0 + # Strip unrenderable words as not all fonts will render the extended + # latin symbols found in Vietnamese text. 
+ WORD_DAWG_SIZE=1000000 + if not FONTS: FONTS = EARLY_LATIN_FONTS + + # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic. + elif lang == 'rus': + if not FONTS: FONTS = RUSSIAN_FONTS + MIX_LANG="rus" + NUMBER_DAWG_FACTOR=0.05 + WORD_DAWG_SIZE=1000000 + elif lang in ('aze_cyrl','bel','bul','kaz','mkd','srp','tgk','ukr','uzb_cyrl' ): + MIX_LANG=f"{lang}" + if not FONTS: FONTS = RUSSIAN_FONTS + + # Special code for performing Cyrillic language-id that is trained on + # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian + # text with the list of Russian fonts. + elif lang == 'cyr_lid': + TEXT_CORPUS=f'{FLAGS_webtext_prefix}/cyr_lid.corpus.txt' + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + GENERATE_WORD_BIGRAMS=0 + WORD_DAWG_SIZE=1000000 + if not FONTS: FONTS = RUSSIAN_FONTS + + # South Asian scripts mostly have a lot of different graphemes, so trim + # down the MEAN_COUNT so as not to get a huge amount of text. + elif lang in ('asm','ben' ): + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = BENGALI_FONTS + elif lang in ( 'bih','hin','mar','nep','san' ): + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = DEVANAGARI_FONTS + elif lang == 'bod': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = TIBETAN_FONTS + elif lang == 'dzo': + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = TIBETAN_FONTS + elif lang == 'guj': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + if not FONTS: FONTS = GUJARATI_FONTS + elif lang == 'kan': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = KANNADA_FONTS + elif lang == 'mal': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = MALAYALAM_FONTS + elif lang == 'ori': + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = ORIYA_FONTS + elif lang == 'pan': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = PUNJABI_FONTS + elif lang == 'sin': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.01 + if not FONTS: FONTS = SINHALA_FONTS + elif lang == 'tam': + MEAN_COUNT=30 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = TAMIL_FONTS + elif lang == 'tel': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"] + TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"] + if not FONTS: FONTS = TELUGU_FONTS + + # SouthEast Asian scripts. 
+ elif lang == 'jav_java': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = JAVANESE_FONTS + elif lang == 'khm': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = KHMER_FONTS + elif lang == 'lao': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = LAOTHIAN_FONTS + elif lang == 'mya': + MEAN_COUNT=12 + WORD_DAWG_FACTOR=0.15 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + if not FONTS: FONTS = BURMESE_FONTS + elif lang == 'tha': + MEAN_COUNT=30 + WORD_DAWG_FACTOR=0.01 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + FILTER_ARGUMENTS+=["--segmenter_lang=tha"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + AMBIGS_FILTER_DENOMINATOR="1000" + LEADING=48 + if not FONTS: FONTS = THAI_FONTS + + # CJK + elif lang == 'chi_sim': + MEAN_COUNT=15 + PUNC_DAWG_FACTOR=0.015 + WORD_DAWG_FACTOR=0.015 + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + FILTER_ARGUMENTS+=["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"] + if not FONTS: FONTS = CHI_SIM_FONTS + elif lang == 'chi_tra': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.015 + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + FILTER_ARGUMENTS+=["--charset_filter=chi_tra", "--segmenter_lang=chi_tra"] + if not FONTS: FONTS = CHI_TRA_FONTS + elif lang == 'jpn': + MEAN_COUNT=15 + WORD_DAWG_FACTOR=0.015 + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="] + FILTER_ARGUMENTS+=["--charset_filter=jpn", "--segmenter_lang=jpn"] + if not FONTS: FONTS = JPN_FONTS + elif lang == 'kor': + MEAN_COUNT=20 + WORD_DAWG_FACTOR=0.015 + NUMBER_DAWG_FACTOR=0.05 + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"] + TRAINING_DATA_ARGUMENTS+=["--desired_bigrams="] + GENERATE_WORD_BIGRAMS=0 + FILTER_ARGUMENTS+=["--charset_filter=kor","--segmenter_lang=kor"] + if not FONTS: FONTS = KOREAN_FONTS + + # Middle-Eastern scripts. + elif lang == 'ara': + if not FONTS: FONTS = ARABIC_FONTS + elif lang == 'div': + if not FONTS: FONTS = THAANA_FONTS + elif lang in ('fas','pus','snd','uig','urd' ): + if not FONTS: FONTS = PERSIAN_FONTS + elif lang in ('heb','yid' ): + NUMBER_DAWG_FACTOR=0.05 + WORD_DAWG_FACTOR=0.08 + if not FONTS: FONTS = HEBREW_FONTS + elif lang == 'syr': + if not FONTS: FONTS = SYRIAC_FONTS + + # Other scripts.
+ elif lang in ('amh','tir'): + if not FONTS: FONTS = AMHARIC_FONTS + elif lang == 'chr': + if not FONTS: + FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"] + elif lang == 'ell': + NUMBER_DAWG_FACTOR=0.05 + WORD_DAWG_FACTOR=0.08 + if not FONTS: FONTS = GREEK_FONTS + elif lang == 'grc': + if not EXPOSURES: EXPOSURES="-3 -2 -1 0 1 2 3".split() + if not FONTS: FONTS = ANCIENT_GREEK_FONTS + elif lang == 'hye': + if not FONTS: FONTS = ARMENIAN_FONTS + elif lang == 'iku': + if not FONTS: FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS + elif lang == 'kat': + if not FONTS: FONTS = GEORGIAN_FONTS + elif lang == 'kat_old': + TEXT_CORPUS=f"{FLAGS_webtext_prefix}/kat.corpus.txt" + if not FONTS: FONTS = OLD_GEORGIAN_FONTS + elif lang == 'kir': + if not FONTS: FONTS = KYRGYZ_FONTS + TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=100"] + elif lang == 'kur': + if not FONTS: FONTS = KURDISH_FONTS + else: + raise ValueError(f"{lang} is not a valid language code") + + + FLAGS_mean_count = int(os.environ.get('FLAGS_mean_count', -1)) + if FLAGS_mean_count > 0: + TRAINING_DATA_ARGUMENTS+=[f"--mean_count={FLAGS_mean_count}"] + elif MEAN_COUNT: + TRAINING_DATA_ARGUMENTS+=[f"--mean_count={MEAN_COUNT}"] + + # Default to Latin fonts if none have been set + if not FONTS: FONTS = LATIN_FONTS + + # Default to 0 exposure if it hasn't been set + if not EXPOSURES: EXPOSURES=[0] + # Set right-to-left and normalization mode. + if lang in ('ara','div', 'fas','pus','snd','syr','uig','urd','kur_ara','heb','yid'): + LANG_IS_RTL=True + NORM_MODE=2 + elif lang in ( + 'asm','ben','bih','hin','mar','nep','guj','kan','mal','tam','tel','pan', + 'dzo','sin','san','bod','ori','khm','mya','tha','lao','jav','jav_java' + ): + LANG_IS_RTL=False + NORM_MODE=2 + else: + LANG_IS_RTL=False + NORM_MODE=1 + + # Copy the uppercase locals set above onto ctx as lowercase attributes, + # logging any values that changed. + for var in list(locals()): + if var.isupper(): + value = locals()[var] + lowervar = var.lower() + if hasattr(ctx, lowervar) and getattr(ctx, lowervar) != value: + log.debug(f"{lowervar} = {value} (was {getattr(ctx, lowervar)})") + setattr(ctx, lowervar, value) + elif hasattr(ctx, lowervar): + log.debug(f"{lowervar} = {value} (set on cmdline)") + else: + log.debug(f"{lowervar} = {value}") + setattr(ctx, lowervar, value) + + return ctx + +#============================================================================= +# END of Language specific info +#============================================================================= diff --git a/src/training/tesstrain.py b/src/training/tesstrain.py new file mode 100644 index 0000000000..a6aa6276b9 --- /dev/null +++ b/src/training/tesstrain.py @@ -0,0 +1,92 @@ +# (C) Copyright 2014, Google Inc. +# (C) Copyright 2018, James R Barlow +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This script provides an easy way to execute various phases of training +# Tesseract.
For a detailed description of the phases, see +# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract +# +import sys,os,subprocess, logging + + +sys.path.insert(0, os.path.dirname(__file__)) +from tesstrain_utils import parse_flags, initialize_fontconfig, phase_I_generate_image, \ + phase_UP_generate_unicharset, phase_E_extract_features, make_lstmdata, cleanup +import language_specific + +log = logging.getLogger() + +def setup_logging(logfile): + log.setLevel(logging.DEBUG) + console = logging.StreamHandler() + console.setLevel(logging.INFO) + console_formatter = logging.Formatter( + '[%(asctime)s] %(levelname)s - %(message)s', + datefmt='%H:%M:%S' + ) + console.setFormatter(console_formatter) + log.addHandler(console) + + logfile = logging.FileHandler(logfile) + logfile.setLevel(logging.DEBUG) + logfile_formatter = logging.Formatter( + '[%(asctime)s] - %(levelname)s - %(name)s - %(message)s' + ) + logfile.setFormatter(logfile_formatter) + log.addHandler(logfile) + +def main(): + ctx = parse_flags() + setup_logging(ctx.log_file) + if not ctx.linedata: + log.error('--linedata_only is required since only LSTM is supported') + sys.exit(1) + + log.info(f"=== Starting training for language {ctx.lang_code}") + ctx = language_specific.set_lang_specific_parameters(ctx, ctx.lang_code) + + initialize_fontconfig(ctx) + phase_I_generate_image(ctx, par_factor=8) + phase_UP_generate_unicharset(ctx) + + if ctx.linedata: + phase_E_extract_features(ctx, ['--psm', '6', 'lstm.train'], 'lstmf') + make_lstmdata(ctx) + + cleanup(ctx) + log.info("All done!") + return 0 + +if __name__ == '__main__': + main() + + +# _rc0 = subprocess.call(["tlog","\n=== Starting training for language '"+str(LANG_CODE.val)+"'"],shell=True) +# _rc0 = subprocess.call(["source",os.popen("dirname "+__file__).read().rstrip("\n")+"/language-specific.sh"],shell=True) +# _rc0 = subprocess.call(["set_lang_specific_parameters",str(LANG_CODE.val)],shell=True) +# _rc0 = subprocess.call(["initialize_fontconfig"],shell=True) +# _rc0 = subprocess.call(["phase_I_generate_image","8"],shell=True) +# _rc0 = subprocess.call(["phase_UP_generate_unicharset"],shell=True) +# if (LINEDATA ): + #subprocess.call(["phase_E_extract_features"," --psm 6 lstm.train ","8","lstmf"],shell=True) +# subprocess.call(["make__lstmdata"],shell=True) +# subprocess.call(["tlog","\nCreated starter traineddata for language '"+str(LANG_CODE.val)+"'\n"],shell=True) +# subprocess.call(["tlog","\nRun lstmtraining to do the LSTM training for language '"+str(LANG_CODE.val)+"'\n"],shell=True) +# else: +# subprocess.call(["phase_D_generate_dawg"],shell=True) +# subprocess.call(["phase_E_extract_features","box.train","8","tr"],shell=True) +# subprocess.call(["phase_C_cluster_prototypes",str(TRAINING_DIR.val)+"/"+str(LANG_CODE.val)+".normproto"],shell=True) +# if (str(ENABLE_SHAPE_CLUSTERING.val) == "y" ): +# subprocess.call(["phase_S_cluster_shapes"],shell=True) +# subprocess.call(["phase_M_cluster_microfeatures"],shell=True) +# subprocess.call(["phase_B_generate_ambiguities"],shell=True) +# subprocess.call(["make__traineddata"],shell=True) +# subprocess.call(["tlog","\nCompleted training for language '"+str(LANG_CODE.val)+"'\n"],shell=True) diff --git a/src/training/tesstrain_utils.py b/src/training/tesstrain_utils.py new file mode 100644 index 0000000000..d7acff8bfc --- /dev/null +++ b/src/training/tesstrain_utils.py @@ -0,0 +1,617 @@ +# (C) Copyright 2014, Google Inc. 
+# (C) Copyright 2018, James R Barlow +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# For a detailed description of the phases, see +# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract +# + +import os +import sys +from datetime import date +from tempfile import TemporaryDirectory, mkdtemp +from pathlib import Path +from shutil import which +import logging +import subprocess +import argparse +from operator import itemgetter +import concurrent.futures +import shutil +import atexit + +from tqdm import tqdm + +from language_specific import VERTICAL_FONTS + +log = logging.getLogger(__name__) + +class TrainingArgs(argparse.Namespace): + def __init__(self): + self.uname = os.uname().sysname.lower() + self.lang_code="eng" + self.timestamp=str(date.today()) + + self._font_config_cache = TemporaryDirectory(prefix='font_tmp') + self.font_config_cache = self._font_config_cache.name + self.fonts_dir="/Library/Fonts/" if 'darwin' in self.uname else "/usr/share/fonts/" + + self.max_pages=0 + self.save_box_tiff=False + self.output_dir="/tmp/tesstrain/tessdata" + self.overwrite=False + self.linedata=False + self.run_shape_clustering=False + self.extract_font_properties=True + self._workspace_dir=TemporaryDirectory(prefix='tesstrain') + self.workspace_dir = self._workspace_dir.name + + +def err_exit(msg): + log.critical(msg) + sys.exit(1) + +# Helper function to run a command and append its output to a log. Aborts early +# if the program file is not found. +# Usage: run_command CMD ARG1 ARG2... +def run_command(cmd, *args, env=None): + for d in ('', 'api/', 'training/'): + testcmd = which(f'{d}{cmd}') + if testcmd: + cmd = testcmd + break + if not which(cmd): + err_exit(f"{cmd} not found") + + log.debug(f"Running {cmd}") + for arg in args: + log.debug(arg) + + proc = subprocess.run([cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) + proclog = logging.getLogger(cmd) + if proc.returncode == 0: + proclog.debug(proc.stdout.decode('utf-8', errors='replace')) + else: + proclog.error(proc.stdout.decode('utf-8', errors='replace')) + err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.") + + +# Check if all the given files exist, or exit otherwise. +# Used to check required input files and produced output files in each phase. +# Usage: check_file_readable FILE1 FILE2... +def check_file_readable(*filenames): + for filename in filenames: + try: + with Path(filename).open() as f: + pass + except FileNotFoundError: + err_exit(f"Expected file {filename} does not exist") + except PermissionError: + err_exit(f"{filename} is not readable") + except IOError as e: + err_exit(f"{filename} IO Error: {str(e)}") + return True + + + +parser = argparse.ArgumentParser( + epilog=""" + The font names specified in --fontlist need to be recognizable by Pango using + fontconfig.
An easy way to list the canonical names of all fonts available on + your system is to run text2image with --list_available_fonts and the + appropriate --fonts_dir path. + """, +) +parser.add_argument('--fontlist', dest='fonts', nargs='+', type=str, help='A list of fontnames to train on.') +parser.add_argument('--fonts_dir', help='Path to font files.') +parser.add_argument('--lang', metavar='LANG_CODE', dest='lang_code', help='ISO 639 code.') +parser.add_argument('--langdata_dir', metavar='DATADIR', help='Path to tesseract/training/langdata directory.') +parser.add_argument('--maxpages', type=int, dest='max_pages') +parser.add_argument('--output_dir', metavar='OUTPUTDIR', help='Location of output traineddata file.') +parser.add_argument('--overwrite', action='store_true', help='Safe to overwrite files in output_dir.') +parser.add_argument('--save_box_tiff', action='store_true', help='Save box/tiff pairs along with lstmf files.') +parser.add_argument('--linedata_only', dest='linedata', action='store_true', help='Only generate training data for lstmtraining.') + +inputdata_group = parser.add_argument_group('inputdata', 'OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.') +inputdata_group.add_argument('--training_text', metavar='TEXTFILE',help='Text to render and use for training.') +inputdata_group.add_argument('--wordlist', dest='wordlist_file', metavar='WORDFILE', help='Word list for the language ordered by decreasing frequency.') + +parser.add_argument('--extract_font_properties', action='store_true') +parser.add_argument('--noextract_font_properties', dest='extract_font_properties', action='store_false') + +tessdata_group = parser.add_argument_group('tessdata', 'OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.') +tessdata_group.add_argument('--tessdata_dir', metavar='TESSDATADIR', help='Path to tesseract/tessdata directory.') + +parser.add_argument('--exposures', metavar='EXPOSURES', action='append', nargs='+', help='A list of exposure levels to use (e.g. -1,0,1).') +parser.add_argument('--workspace_dir') + + +# Does simple command-line parsing and initialization. +def parse_flags(argv=None): + ctx =TrainingArgs() + log.debug(ctx) + parser.parse_args(args=argv, namespace=ctx) + log.debug(ctx) + log.info("Parsing") + + if not ctx.lang_code: + err_exit("Need to specify a language --lang") + if not ctx.langdata_dir: + err_exit("Need to specify path to language files --langdata_dir") + if not ctx.tessdata_dir: + tessdata_prefix=os.environ.get('TESSDATA_PREFIX', '') + if not tessdata_prefix: + err_exit("Need to specify a --tessdata_dir or have a " + "TESSDATA_PREFIX variable defined in your environment") + else: + ctx.tessdata_dir = tessdata_prefix + + # Location where intermediate files will be created. + ctx.training_dir = mkdtemp(prefix=f"{ctx.lang_code}-{ctx.timestamp}") + # Location of log file for the whole run. + ctx.log_file = Path(ctx.training_dir) / "tesstrain.log" + log.info(f"Log file {ctx.log_file}") + + def show_tmpdir_location(training_dir): + # On successful exit we will delete this first; on failure we want to let the user + # know where the log is + if Path(training_dir).exists(): + print(f"Temporary files retained at: {training_dir}") + atexit.register(show_tmpdir_location, ctx.training_dir) + + # Take training text and wordlist from the langdata directory if not + # specified in the command-line. 
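+ # For example, with --lang eng and --langdata_dir ../langdata these + # defaults resolve to ../langdata/eng/eng.training_text and + # ../langdata/eng/eng.wordlist, the layout used by the + # tesseract-ocr/langdata repository.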
+ if not ctx.training_text: + ctx.training_text = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text" + if not ctx.wordlist_file: + ctx.wordlist_file = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist" + + ctx.word_bigrams_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams" + ctx.numbers_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers" + ctx.punc_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc" + ctx.bigram_freqs_file=Path(ctx.training_text).with_suffix(".training_text.bigram_freqs") + ctx.unigram_freqs_file=Path(ctx.training_text).with_suffix( ".training_text.unigram_freqs") + ctx.train_ngrams_file=Path(ctx.training_text).with_suffix( ".training_text.train_ngrams") + ctx.generate_dawgs=1 + log.debug(ctx) + return ctx + + +def cleanup(ctx): + shutil.copy(ctx.log_file, ctx.output_dir) + shutil.rmtree(ctx.training_dir) + return + +# Function initializes font config with a unique font cache dir. +def initialize_fontconfig(ctx): + sample_path=Path(ctx.font_config_cache)/'sample_text.txt' + Path(sample_path).write_text('Text\n') + log.info(f"Testing font: {ctx.fonts[0]}") + run_command( + 'text2image', f'--fonts_dir={ctx.fonts_dir}', + f"--font={ctx.fonts[0]}", f"--outputbase={sample_path}", f"--text={sample_path}", + f"--fontconfig_tmpdir={ctx.font_config_cache}" + ) + + +def make_fontname(font): + return font.replace(' ', '_').replace(',', '') + +def make_outbase(ctx, fontname,exposure): + return Path(ctx.training_dir)/f"{ctx.lang_code}.{fontname}.exp{exposure}" + +# Helper function for phaseI_generate_image. Generates the image for a single +# language/font combination in a way that can be run in parallel. +def generate_font_image(ctx, font, exposure, char_spacing): + + log.info(f"Rendering using {font}") + fontname=make_fontname(font) + outbase=make_outbase(ctx, fontname, exposure) + + common_args=[ + f"--fontconfig_tmpdir={ctx.font_config_cache}", + f"--fonts_dir={ctx.fonts_dir}", + f"--strip_unrenderable_words", + f"--leading={ctx.leading}", + f"--char_spacing={char_spacing}", + f"--exposure={exposure}", + f"--outputbase={outbase}", + f"--max_pages={ctx.max_pages}", + ] + + # add --writing_mode=vertical-upright to common_args if the font is + # specified to be rendered vertically. + if font in VERTICAL_FONTS: + common_args.append('--writing_mode=vertical-upright') + + run_command( + 'text2image', + *common_args, + f"--font={font}", + f"--text={ctx.training_text}", + *ctx.text2image_extra_args + ) + + check_file_readable(str(outbase) + '.box', str(outbase) + '.tif') + + if ctx.extract_font_properties and Path(ctx.train_ngrams_file).exists(): + log.info(f"Extracting font properties of {font}") + run_command( + 'text2image', + *common_args, + f"--font={font}", + f"--ligatures=false", + f"--text={ctx.train_ngrams_file}", + f"--only_extract_font_properties", + f"--ptsize=32" + ) + check_file_readable(str(outbase) + '.fontinfo') + return f'{font}-{exposure}' + +# Phase I : Generate (I)mages from training text for each font. +def phase_I_generate_image(ctx, par_factor): + + if not par_factor or par_factor <= 0: + par_factor = 1 + + log.info("=== Phase I: Generating training images ===") + check_file_readable(ctx.training_text) + char_spacing=0.0 + + for exposure in ctx.exposures: + if ctx.extract_font_properties and Path(ctx.bigram_freqs_file).exists(): + # Parse .bigram_freqs file and compose a .train_ngrams file with text + # for tesseract to recognize during training. 
Take only the ngrams whose + # combined weight accounts for 99% of all the bigrams in the language. + lines = Path(ctx.bigram_freqs_file).read_text(encoding='utf-8').split('\n') + # Each line is "<bigram> <count>"; parse the counts once so they sort + # and sum numerically. + records = [(rec[0], int(rec[1])) for rec in (line.split(' ') for line in lines) if len(rec) == 2] + p = 0.99 + ngram_frac = p * sum(count for _, count in records) + + with Path(ctx.train_ngrams_file).open('w', encoding='utf-8') as f: + cumsum = 0 + for bigram, count in sorted(records, key=itemgetter(1), reverse=True): + if cumsum > ngram_frac: + break + f.write(bigram + ' ') + cumsum += count + + check_file_readable(ctx.train_ngrams_file) + + with tqdm(total=len(ctx.fonts)) as pbar, \ + concurrent.futures.ThreadPoolExecutor(max_workers=par_factor) as executor: + futures = [ + executor.submit(generate_font_image, ctx, font, exposure, char_spacing) + for font in ctx.fonts + ] + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as exc: + err_exit("Failed while generating images: " + str(exc)) + else: + pbar.update(1) + + # Check that each process was successful. + for font in ctx.fonts: + fontname=make_fontname(font) + outbase=make_outbase(ctx, fontname, exposure) + check_file_readable(str(outbase) + '.box', str(outbase) + '.tif') + return + + + +# Phase UP : Generate (U)nicharset and (P)roperties file. +def phase_UP_generate_unicharset(ctx): + log.info("=== Phase UP: Generating unicharset and unichar properties files ===") + + box_files=Path(ctx.training_dir).glob('*.box') + + ctx.unicharset_file=Path(ctx.training_dir) / f'{ctx.lang_code}.unicharset' + + run_command( + 'unicharset_extractor', + '--output_unicharset', f"{ctx.unicharset_file}", + '--norm_mode', f"{ctx.norm_mode}", + *box_files + ) + check_file_readable(ctx.unicharset_file) + + ctx.xheights_file=Path(ctx.training_dir) / f'{ctx.lang_code}.xheights' + run_command( + 'set_unicharset_properties', + '-U', f'{ctx.unicharset_file}', + '-O', f'{ctx.unicharset_file}', + '-X', f'{ctx.xheights_file}', + f'--script_dir={ctx.langdata_dir}' + ) + check_file_readable(ctx.xheights_file) + + +# # Phase D : Generate (D)awg files from unicharset file and wordlist files +# phase_D_generate_dawg() { +# tlog "\n=== Phase D: Generating Dawg files ===" + +# # Skip if requested +# if [[ ${GENERATE_DAWGS} -eq 0 ]]; then +# tlog "Skipping ${phase_name}" +# return +# fi + +# # Output files +# WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg +# FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg +# PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg +# NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg +# BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg + +# # Word DAWG +# local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq +# if [[ -s ${WORDLIST_FILE} ]]; then +# tlog "Generating word Dawg" +# check_file_readable ${unicharset_file} +# run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ +# ${UNICHARSET_FILE} +# check_file_readable ${WORD_DAWG} + +# FREQ_DAWG_SIZE=100 +# head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file} +# fi + +# # Freq-word DAWG +# if [[ -s ${freq_wordlist_file} ]]; then +# check_file_readable ${UNICHARSET_FILE} +# tlog "Generating frequent-word Dawg" +# run_command wordlist2dawg -r 1 ${freq_wordlist_file} \ +# ${FREQ_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${FREQ_DAWG} +# fi + +# # Punctuation DAWG +# # -r arguments to wordlist2dawg denote RTL reverse policy +# # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
+# # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, +# # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, +# # 2/RRP_FORCE_REVERSE for the punctuation DAWG. +# local punc_reverse_policy=0; +# if [[ "${LANG_IS_RTL}" == "1" ]]; then +# punc_reverse_policy=2 +# fi +# if [[ ! -s ${PUNC_FILE} ]]; then +# PUNC_FILE="{ctx.langdata_dir}/common.punc" +# fi +# check_file_readable ${PUNC_FILE} +# run_command wordlist2dawg -r ${punc_reverse_policy} \ +# ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${PUNC_DAWG} + +# # Numbers DAWG +# if [[ -s ${NUMBERS_FILE} ]]; then +# run_command wordlist2dawg -r 0 \ +# ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${NUMBER_DAWG} +# fi + +# # Bigram dawg +# if [[ -s ${WORD_BIGRAMS_FILE} ]]; then +# run_command wordlist2dawg -r 1 \ +# ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} +# check_file_readable ${BIGRAM_DAWG} +# fi +# } + +# Phase E : (E)xtract .tr feature files from .tif/.box files +def phase_E_extract_features(ctx, box_config, ext): + log.info(f"=== Phase E: Generating {ext} files ===") + + img_files=list(Path(ctx.training_dir).glob('*.exp*.tif')) + log.debug(img_files) + + # Use any available language-specific configs. + config="" + testconfig = Path(ctx.langdata_dir) / ctx.lang_code / f'{ctx.lang_code}.config' + if testconfig.exists(): + config = testconfig + log.info(f"Using {ctx.lang_code}.config") + + tessdata_environ = os.environ.copy() + tessdata_environ['TESSDATA_PREFIX'] = str(ctx.tessdata_dir) + + log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}") + + with tqdm(total=len(img_files)) as pbar, \ + concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + futures = [] + for img_file in img_files: + future = executor.submit( + run_command, + 'tesseract', + img_file, + Path(img_file).with_suffix(''), + *box_config, + config, + env=tessdata_environ + ) + futures.append(future) + + for future in concurrent.futures.as_completed(futures): + try: + future.result() + except Exception as exc: + err_exit("Failed while extracting features: " + str(exc)) + else: + pbar.update(1) + # Check that all the output files were produced. + for img_file in img_files: + check_file_readable(Path(img_file.with_suffix('.' + ext))) + + return + +# # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining) +# # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto +# phase_C_cluster_prototypes() { +# tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" +# local out_normproto=$1 + +# run_command cntraining -D "${TRAINING_DIR}/" \ +# $(ls ${TRAINING_DIR}/*.tr) + +# check_file_readable ${TRAINING_DIR}/normproto +# mv ${TRAINING_DIR}/normproto ${out_normproto} +# } + +# # Phase S : (S)hape clustering +# phase_S_cluster_shapes() { +# if ((! 
RUN_SHAPE_CLUSTERING)); then +# tlog "\n=== Shape Clustering disabled ===" +# return +# fi +# check_file_readable {ctx.langdata_dir}/font_properties +# local font_props="-F {ctx.langdata_dir}/font_properties" +# if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\ +# [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then +# font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" +# fi + +# run_command shapeclustering \ +# -D "${TRAINING_DIR}/" \ +# -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ +# -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ +# ${font_props} \ +# $(ls ${TRAINING_DIR}/*.tr) +# check_file_readable ${TRAINING_DIR}/shapetable \ +# ${TRAINING_DIR}/${LANG_CODE}.mfunicharset +# } + +# # Phase M : Clustering microfeatures (mfTraining) +# phase_M_cluster_microfeatures() { +# tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ===" + +# check_file_readable {ctx.langdata_dir}/font_properties +# font_props="-F {ctx.langdata_dir}/font_properties" +# if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \ +# [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then +# font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" +# fi + +# run_command mftraining \ +# -D "${TRAINING_DIR}/" \ +# -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ +# -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ +# ${font_props} \ +# $(ls ${TRAINING_DIR}/*.tr) +# check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \ +# ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset +# mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp +# mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable +# mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable +# mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset +# } + +# phase_B_generate_ambiguities() { +# tlog "\n=== Phase B : ambiguities training ===" + +# # Check for manually created ambiguities data. +# if [[ -r {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then +# tlog "Found file {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs" +# cp {ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE}.unicharambigs \ +# ${TRAINING_DIR}/${LANG_CODE}.unicharambigs +# # Make it writable, as it may be read-only in the client. +# chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs +# return +# else +# tlog "No unicharambigs file found!" +# fi + +# # TODO: Add support for generating ambiguities automatically. +# } + +def make_lstmdata(ctx): + log.info("=== Constructing LSTM training data ===") + lang_prefix=f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}" + path_output = Path(ctx.output_dir) + if not path_output.is_dir(): + log.info(f"Creating new directory {ctx.output_dir}") + path_output.mkdir(exist_ok=True, parents=True) + + args = [] + if ctx.lang_is_rtl: + args.append("--lang_is_rtl") + if ctx.norm_mode >= 2: + args.append("--pass_through_recoder") + + # Build the starter traineddata from the inputs. 
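+ # combine_lang_model reads the unicharset together with the wordlist, + # numbers and punc files, and writes a starter traineddata (unicharset, + # recoder and dawgs) under output_dir, which lstmtraining then uses as + # its starting point.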
+ run_command( + 'combine_lang_model', + '--input_unicharset', f"{ctx.training_dir}/{ctx.lang_code}.unicharset", + '--script_dir', f"{ctx.langdata_dir}", + '--words', f"{lang_prefix}.wordlist", + '--numbers', f"{lang_prefix}.numbers", + '--puncs', f"{lang_prefix}.punc", + '--output_dir', f"{ctx.output_dir}", + '--lang', f"{ctx.lang_code}", + *args + ) + + def get_file_list(): + training_path = Path(ctx.training_dir) + if ctx.save_box_tiff: + log.info("=== Saving box/tiff pairs for training data ===") + yield from training_path.glob(f'{ctx.lang_code}*.box') + yield from training_path.glob(f'{ctx.lang_code}*.tif') + log.info("=== Moving lstmf files for training data ===") + yield from training_path.glob(f'{ctx.lang_code}.*.lstmf') + + for f in get_file_list(): + log.debug(f"Moving {f} to {path_output / f.name}") + shutil.move(str(f), path_output / f.name) + + lstm_list=f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt" + dir_listing = (str(p) for p in path_output.glob(f'{ctx.lang_code}.*.lstmf')) + Path(lstm_list).write_text('\n'.join(dir_listing)) + + +# make__traineddata() { +# tlog "\n=== Making final traineddata file ===" +# local lang_prefix={ctx.langdata_dir}/${LANG_CODE}/${LANG_CODE} + +# # Combine available files for this language from the langdata dir. +# if [[ -r ${lang_prefix}.config ]]; then +# tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}" +# cp ${lang_prefix}.config ${TRAINING_DIR} +# chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config +# fi +# if [[ -r ${lang_prefix}.params-model ]]; then +# tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}" +# cp ${lang_prefix}.params-model ${TRAINING_DIR} +# chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model +# fi + +# # Compose the traineddata file. +# run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}. + +# # Copy it to the output dir, overwriting only if allowed by the cmdline flag. +# if [[ ! -d ${OUTPUT_DIR} ]]; then +# tlog "Creating new directory ${OUTPUT_DIR}" +# mkdir -p ${OUTPUT_DIR} +# fi +# local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata; +# if [[ -f ${destfile} ]] && ((! OVERWRITE)); then +# err_exit "File ${destfile} exists and no --overwrite specified"; +# fi +# tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}" +# cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile} +# }
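+ +# A short usage sketch for the new scripts (hypothetical paths; the flags +# are the ones defined by the argument parser above): +# +# python3 src/training/tesstrain.py --lang eng --linedata_only \ +# --langdata_dir ../langdata --tessdata_dir ./tessdata \ +# --output_dir ~/tesstutorial/engtrain +# +# This renders training images for each font, builds the unicharset, +# extracts .lstmf features with "--psm 6 lstm.train", and assembles the +# starter traineddata that lstmtraining takes as input.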