Add low accuracy mode (#17)

pemistahl · Nov 14, 2022 · 8508880 · 8508880
1 parent 64a3654
commit 8508880
Show file tree

Hide file tree

Showing 207 changed files with 1,812 additions and 513 deletions.
diff --git a/builder.go b/builder.go
@@ -89,7 +89,7 @@ type LanguageDetectorBuilder interface {
 	// dependent on the length of the input text. The longer the input
 	// text, the larger the distance between the languages. So if you
 	// want to classify very short text phrases, do not set the minimum
-	// relative distance too high. Otherwise you will get most results
+	// relative distance too high. Otherwise, you will get most results
 	// returned as Unknown which is the return value for cases
 	// where language detection is not reliably possible.
 	//
@@ -107,6 +107,20 @@ type LanguageDetectorBuilder interface {
 	// method allows to switch between these two loading modes.
 	WithPreloadedLanguageModels() LanguageDetectorBuilder
 
+	// WithLowAccuracyMode disables the high accuracy mode in order to save
+	// memory and increase performance.
+	//
+	// By default, Lingua's high detection accuracy comes at the cost of
+	// loading large language models into memory which might not be feasible
+	// for systems running low on resources.
+	//
+	// This method disables the high accuracy mode so that only a small subset
+	// of language models is loaded into memory. The downside of this approach
+	// is that detection accuracy for short texts consisting of fewer than 120
+	// characters will drop significantly. However, detection accuracy for texts
+	// which are longer than 120 characters will remain mostly unaffected.
+	WithLowAccuracyMode() LanguageDetectorBuilder
+
 	// Build creates and returns the configured instance of LanguageDetector.
 	Build() LanguageDetector
 	getLanguages() []Language
@@ -117,6 +131,7 @@ type languageDetectorBuilder struct {
 	languages                     []Language
 	minimumRelativeDistance       float64
 	isEveryLanguageModelPreloaded bool
+	isLowAccuracyModeEnabled      bool
 }
 
 // NewLanguageDetectorBuilder returns a new instance that implements the
@@ -225,11 +240,17 @@ func (builder *languageDetectorBuilder) WithPreloadedLanguageModels() LanguageDe
 	return builder
 }
 
+func (builder *languageDetectorBuilder) WithLowAccuracyMode() LanguageDetectorBuilder {
+	builder.isLowAccuracyModeEnabled = true
+	return builder
+}
+
 func (builder *languageDetectorBuilder) Build() LanguageDetector {
 	return newLanguageDetector(
 		builder.languages,
 		builder.minimumRelativeDistance,
 		builder.isEveryLanguageModelPreloaded,
+		builder.isLowAccuracyModeEnabled,
 	)
 }
 
@@ -245,6 +266,7 @@ func (builder *languageDetectorBuilder) from(languages []Language) LanguageDetec
 	builder.languages = removeDuplicateLanguages(languages)
 	builder.minimumRelativeDistance = 0.0
 	builder.isEveryLanguageModelPreloaded = false
+	builder.isLowAccuracyModeEnabled = false
 	return builder
 }
 

diff --git a/cmd/accuracy-reports/aggregated-accuracy-values.csv b/cmd/accuracy-reports/aggregated-accuracy-values.csv
@@ -1,76 +1,76 @@
-language,average-whatlang,single-words-whatlang,word-pairs-whatlang,sentences-whatlang,average-cld3,single-words-cld3,word-pairs-cld3,sentences-cld3,average-lingua,single-words-lingua,word-pairs-lingua,sentences-lingua
-Afrikaans,51,21,39,92,55,22,46,98,79,58,81,97
-Albanian,NaN,NaN,NaN,NaN,55,18,48,98,88,69,95,100
-Arabic,89,77,91,99,90,79,92,100,98,96,99,100
-Armenian,NaN,NaN,NaN,NaN,99,100,100,97,100,100,100,100
-Azerbaijani,64,45,58,91,81,62,82,99,88,77,88,99
-Basque,NaN,NaN,NaN,NaN,62,33,62,92,84,71,87,93
-Belarusian,81,64,80,98,84,67,86,100,97,91,99,100
-Bengali,100,100,100,100,99,98,99,99,100,100,100,100
-Bokmal,34,15,29,60,NaN,NaN,NaN,NaN,58,39,59,75
-Bosnian,NaN,NaN,NaN,NaN,33,19,28,52,35,29,35,40
-Bulgarian,61,37,57,89,70,45,66,98,87,70,91,99
-Catalan,NaN,NaN,NaN,NaN,48,19,42,84,70,51,74,86
-Chinese,100,100,100,100,92,92,83,100,100,100,100,100
-Croatian,55,28,44,91,42,26,42,58,72,53,74,89
-Czech,50,31,46,71,64,39,65,88,79,65,82,90
-Danish,47,24,38,79,58,26,54,95,81,61,84,98
-Dutch,47,22,36,82,58,29,47,97,77,55,81,96
-English,49,17,35,94,54,22,44,97,81,55,89,99
-Esperanto,52,25,45,88,57,22,51,98,82,67,80,97
-Estonian,61,36,53,94,70,41,69,99,92,80,96,100
-Finnish,71,45,70,98,80,58,84,99,96,90,98,100
-French,65,37,59,97,55,22,49,94,89,74,94,99
-Ganda,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,91,79,95,100
-Georgian,100,100,100,100,98,99,100,96,100,100,100,100
-German,65,38,60,97,66,40,62,98,89,74,94,100
-Greek,100,100,100,100,100,100,100,100,100,100,100,100
-Gujarati,100,100,100,100,100,99,100,100,100,100,100,100
-Hebrew,90,76,94,99,NaN,NaN,NaN,NaN,100,100,100,100
-Hindi,52,27,40,88,58,34,45,95,73,61,64,93
-Hungarian,62,38,53,95,76,53,76,99,95,86,98,100
-Icelandic,NaN,NaN,NaN,NaN,71,42,70,99,91,79,95,100
-Indonesian,67,39,66,95,46,26,45,66,60,39,61,81
-Irish,NaN,NaN,NaN,NaN,67,42,66,94,91,82,94,96
-Italian,56,25,47,96,62,31,57,98,87,69,92,100
-Japanese,99,100,100,97,98,97,96,100,100,100,100,100
-Kazakh,NaN,NaN,NaN,NaN,82,62,83,99,90,78,94,99
-Korean,100,100,100,100,99,100,100,98,100,100,100,100
-Latin,NaN,NaN,NaN,NaN,62,44,58,83,87,72,93,97
-Latvian,59,36,54,87,75,51,77,98,93,84,96,98
-Lithuanian,62,38,56,92,72,42,75,99,94,86,96,100
-Macedonian,62,39,55,94,60,30,54,97,83,66,86,98
-Malay,NaN,NaN,NaN,NaN,22,11,22,34,31,26,38,30
-Maori,NaN,NaN,NaN,NaN,52,22,43,91,91,82,92,99
-Marathi,73,52,74,93,84,69,84,98,85,74,85,96
-Mongolian,NaN,NaN,NaN,NaN,83,63,87,99,97,93,98,99
-Nynorsk,34,10,24,69,NaN,NaN,NaN,NaN,66,41,66,90
-Persian,70,46,66,99,76,57,70,99,90,78,94,99
-Polish,66,45,59,94,77,51,80,99,94,85,97,100
-Portuguese,57,27,48,96,53,21,40,97,81,59,85,98
-Punjabi,100,100,100,100,100,99,100,100,100,100,100,100
-Romanian,59,35,52,90,53,24,48,88,86,69,91,99
-Russian,53,40,52,68,71,48,72,93,90,76,95,98
-Serbian,57,34,51,86,78,63,75,95,87,74,89,99
-Shona,68,44,65,95,76,51,79,99,91,78,96,100
-Slovak,NaN,NaN,NaN,NaN,63,32,61,96,83,63,89,98
-Slovene,48,25,38,81,63,29,60,99,82,61,87,98
-Somali,68,38,66,99,69,38,70,100,92,82,96,100
-Sotho,NaN,NaN,NaN,NaN,49,15,33,98,85,67,90,99
-Spanish,48,19,33,93,48,16,32,96,70,44,69,97
-Swahili,NaN,NaN,NaN,NaN,57,25,49,98,81,60,84,98
-Swedish,49,24,40,83,61,30,56,96,84,64,88,99
-Tagalog,52,23,43,90,NaN,NaN,NaN,NaN,78,52,83,99
-Tamil,100,100,100,100,100,100,100,99,100,100,100,100
-Telugu,100,100,100,100,99,99,100,99,100,100,100,100
-Thai,100,100,100,99,99,100,100,98,99,100,100,98
-Tsonga,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,84,66,89,98
-Tswana,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,84,65,88,99
-Turkish,54,26,44,92,69,41,70,97,94,84,97,100
-Ukrainian,72,53,71,93,81,62,83,98,92,84,97,95
-Urdu,57,31,46,94,61,39,53,92,86,72,88,97
-Vietnamese,73,36,85,97,66,26,74,99,84,78,87,87
-Welsh,NaN,NaN,NaN,NaN,69,43,66,98,91,78,96,99
-Xhosa,NaN,NaN,NaN,NaN,66,40,65,92,82,64,85,98
-Yoruba,22,11,14,41,15,5,11,28,74,50,77,96
-Zulu,70,44,68,98,63,35,63,92,81,62,83,97
+language,average-whatlang,single-words-whatlang,word-pairs-whatlang,sentences-whatlang,average-cld3,single-words-cld3,word-pairs-cld3,sentences-cld3,average-lingua-low,single-words-lingua-low,word-pairs-lingua-low,sentences-lingua-low,average-lingua-high,single-words-lingua-high,word-pairs-lingua-high,sentences-lingua-high
+Afrikaans,51,21,39,92,55,22,46,98,64,38,62,93,79,58,81,97
+Albanian,NaN,NaN,NaN,NaN,55,18,48,98,80,54,86,99,88,69,95,100
+Arabic,89,77,91,99,90,79,92,100,94,88,96,99,98,96,99,100
+Armenian,NaN,NaN,NaN,NaN,99,100,100,97,100,100,100,100,100,100,100,100
+Azerbaijani,65,45,58,91,81,62,82,99,82,71,78,96,90,77,92,99
+Basque,NaN,NaN,NaN,NaN,62,33,62,92,74,56,76,91,84,71,87,93
+Belarusian,81,64,80,98,84,67,86,100,92,80,95,100,97,92,99,100
+Bengali,100,100,100,100,99,98,99,99,100,100,100,100,100,100,100,100
+Bokmal,34,15,29,60,NaN,NaN,NaN,NaN,49,27,47,74,58,39,59,75
+Bosnian,NaN,NaN,NaN,NaN,33,19,28,52,29,23,29,36,35,29,35,40
+Bulgarian,61,37,57,89,70,45,66,98,78,56,81,96,87,70,91,99
+Catalan,NaN,NaN,NaN,NaN,48,19,42,84,58,33,60,81,70,51,74,86
+Chinese,100,100,100,100,92,92,83,100,100,100,100,100,100,100,100,100
+Croatian,55,28,44,91,42,26,42,58,60,36,57,85,72,53,74,90
+Czech,50,31,46,71,64,39,65,88,71,54,72,87,80,66,84,91
+Danish,47,24,38,79,58,26,54,95,70,45,70,95,81,61,84,98
+Dutch,47,22,36,82,58,29,47,97,64,36,61,94,77,55,81,96
+English,49,17,36,94,54,22,44,97,62,29,62,96,81,55,89,99
+Esperanto,52,25,45,88,57,22,51,98,66,44,61,92,84,67,85,98
+Estonian,61,36,53,94,70,41,69,99,83,62,88,99,92,80,96,100
+Finnish,71,45,70,98,80,58,84,99,91,77,95,100,96,90,98,100
+French,64,37,59,97,55,22,49,94,77,52,83,97,89,74,94,99
+Ganda,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,84,65,87,100,91,79,95,100
+Georgian,100,100,100,100,98,99,100,96,100,100,100,100,100,100,100,100
+German,65,38,60,97,66,40,62,98,80,57,84,99,89,74,94,100
+Greek,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100
+Gujarati,100,100,100,100,100,99,100,100,100,100,100,100,100,100,100,100
+Hebrew,90,76,94,99,NaN,NaN,NaN,NaN,100,100,100,100,100,100,100,100
+Hindi,52,27,40,88,58,34,45,95,33,11,20,67,73,61,64,93
+Hungarian,62,38,53,95,76,53,76,99,90,77,94,100,95,87,98,100
+Icelandic,NaN,NaN,NaN,NaN,71,42,70,99,88,72,92,99,93,83,97,100
+Indonesian,67,39,66,95,46,26,45,66,48,25,46,72,60,39,61,81
+Irish,NaN,NaN,NaN,NaN,67,42,66,94,85,70,90,95,91,82,94,96
+Italian,56,25,47,96,62,31,57,98,71,42,74,98,87,69,92,100
+Japanese,99,100,100,97,98,97,96,100,100,100,100,100,100,100,100,100
+Kazakh,NaN,NaN,NaN,NaN,82,62,83,99,90,78,93,99,92,80,96,99
+Korean,100,100,100,100,99,100,100,98,100,100,100,100,100,100,100,100
+Latin,NaN,NaN,NaN,NaN,62,44,58,83,73,49,76,93,87,72,93,97
+Latvian,59,36,54,87,75,51,77,98,87,75,90,97,93,85,97,99
+Lithuanian,62,38,56,92,72,42,75,99,87,76,89,98,95,86,98,100
+Macedonian,62,39,55,94,60,30,54,97,72,52,70,95,84,66,86,99
+Malay,NaN,NaN,NaN,NaN,22,11,22,34,31,22,36,36,31,26,38,30
+Maori,NaN,NaN,NaN,NaN,52,22,43,91,82,62,87,98,91,82,92,99
+Marathi,73,52,74,93,84,69,84,98,41,20,30,72,85,74,85,96
+Mongolian,NaN,NaN,NaN,NaN,83,63,87,99,96,89,98,99,97,93,99,99
+Nynorsk,34,10,24,69,NaN,NaN,NaN,NaN,52,25,49,81,66,41,66,90
+Persian,70,46,66,99,76,57,70,99,80,62,80,98,90,78,94,100
+Polish,66,45,59,94,77,51,80,99,90,77,93,99,95,85,98,100
+Portuguese,57,26,48,96,53,21,40,97,69,42,70,95,81,59,85,98
+Punjabi,100,100,100,100,100,99,100,100,100,100,100,100,100,100,100,100
+Romanian,59,34,52,90,53,24,48,88,72,49,74,94,87,69,92,99
+Russian,53,40,52,68,71,48,72,93,78,59,84,92,90,76,95,98
+Serbian,57,34,51,86,78,63,75,95,78,62,80,91,88,74,90,99
+Shona,68,44,65,95,76,51,79,99,81,56,86,100,91,78,96,100
+Slovak,NaN,NaN,NaN,NaN,63,32,61,96,75,49,78,97,84,64,90,99
+Slovene,48,25,38,81,63,29,60,99,67,39,68,93,82,61,87,99
+Somali,68,38,66,99,69,38,70,100,85,64,90,100,92,82,96,100
+Sotho,NaN,NaN,NaN,NaN,49,15,33,98,72,43,75,97,85,67,90,99
+Spanish,48,19,33,93,48,16,32,96,56,26,49,94,70,44,69,97
+Swahili,NaN,NaN,NaN,NaN,57,25,49,98,70,43,68,97,81,60,84,98
+Swedish,49,24,39,83,61,30,56,96,72,46,76,95,84,64,88,99
+Tagalog,52,23,43,90,NaN,NaN,NaN,NaN,66,36,67,96,78,52,83,99
+Tamil,100,100,100,100,100,100,100,99,100,100,100,100,100,100,100,100
+Telugu,100,100,100,100,99,99,100,99,100,100,100,100,100,100,100,100
+Thai,100,100,100,99,99,100,100,98,99,100,100,98,99,100,100,98
+Tsonga,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,72,46,73,97,84,66,89,98
+Tswana,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,71,44,73,96,84,65,88,99
+Turkish,54,26,44,92,69,41,70,97,87,71,91,99,94,84,98,100
+Ukrainian,72,53,71,93,81,62,83,98,86,75,92,93,92,84,97,95
+Urdu,57,31,46,94,61,39,53,92,80,65,78,96,91,80,94,98
+Vietnamese,73,36,85,97,66,26,74,99,87,76,87,98,91,79,94,99
+Welsh,NaN,NaN,NaN,NaN,69,43,66,98,82,61,87,99,91,78,96,99
+Xhosa,NaN,NaN,NaN,NaN,66,40,65,92,69,45,67,94,82,64,85,98
+Yoruba,22,11,14,41,15,5,11,28,62,33,61,93,75,50,77,97
+Zulu,70,44,68,98,63,35,63,92,70,45,72,94,81,62,83,97
diff --git a/cmd/accuracy-reports/lingua/Afrikaans.txt → ...eports/lingua-high-accuracy/Afrikaans.txt b/cmd/accuracy-reports/lingua/Afrikaans.txt → ...eports/lingua-high-accuracy/Afrikaans.txt
diff --git a/cmd/accuracy-reports/lingua/Albanian.txt → ...reports/lingua-high-accuracy/Albanian.txt b/cmd/accuracy-reports/lingua/Albanian.txt → ...reports/lingua-high-accuracy/Albanian.txt
diff --git a/cmd/accuracy-reports/lingua/Arabic.txt → ...y-reports/lingua-high-accuracy/Arabic.txt b/cmd/accuracy-reports/lingua/Arabic.txt → ...y-reports/lingua-high-accuracy/Arabic.txt
diff --git a/cmd/accuracy-reports/lingua/Armenian.txt → ...reports/lingua-high-accuracy/Armenian.txt b/cmd/accuracy-reports/lingua/Armenian.txt → ...reports/lingua-high-accuracy/Armenian.txt
diff --git a/cmd/accuracy-reports/lingua/Azerbaijani.txt → ...orts/lingua-high-accuracy/Azerbaijani.txt b/cmd/accuracy-reports/lingua/Azerbaijani.txt → ...orts/lingua-high-accuracy/Azerbaijani.txt
@@ -1,16 +1,16 @@
 ##### Azerbaijani #####
 
->>> Accuracy on average: 87.93%
+>>> Accuracy on average: 89.57%
 
 >> Detection of 1000 single words (average length: 8 chars)
-Accuracy: 77.20%
-Erroneously classified as Turkish: 8.10%, Basque: 1.00%, Latin: 0.90%, Albanian: 0.80%, English: 0.70%, Tagalog: 0.70%, Esperanto: 0.60%, Lithuanian: 0.60%, Zulu: 0.60%, Danish: 0.50%, Somali: 0.50%, Swahili: 0.50%, Xhosa: 0.50%, Estonian: 0.40%, Ganda: 0.40%, Malay: 0.40%, Tsonga: 0.40%, Tswana: 0.40%, Yoruba: 0.40%, Bosnian: 0.30%, Italian: 0.30%, Nynorsk: 0.30%, Portuguese: 0.30%, Romanian: 0.30%, Spanish: 0.30%, Swedish: 0.30%, Dutch: 0.20%, Finnish: 0.20%, Indonesian: 0.20%, Shona: 0.20%, Slovene: 0.20%, Welsh: 0.20%, Afrikaans: 0.10%, Bokmal: 0.10%, Croatian: 0.10%, French: 0.10%, German: 0.10%, Hungarian: 0.10%, Icelandic: 0.10%, Irish: 0.10%, Latvian: 0.10%, Maori: 0.10%, Sotho: 0.10%
+Accuracy: 77.40%
+Erroneously classified as Turkish: 8.10%, Latin: 0.90%, Albanian: 0.80%, Basque: 0.80%, English: 0.70%, Tagalog: 0.70%, Esperanto: 0.60%, Lithuanian: 0.60%, Zulu: 0.60%, Danish: 0.50%, Somali: 0.50%, Swahili: 0.50%, Xhosa: 0.50%, Estonian: 0.40%, Ganda: 0.40%, Malay: 0.40%, Tsonga: 0.40%, Tswana: 0.40%, Yoruba: 0.40%, Bosnian: 0.30%, Italian: 0.30%, Nynorsk: 0.30%, Portuguese: 0.30%, Romanian: 0.30%, Spanish: 0.30%, Swedish: 0.30%, Dutch: 0.20%, Finnish: 0.20%, Indonesian: 0.20%, Shona: 0.20%, Slovene: 0.20%, Welsh: 0.20%, Afrikaans: 0.10%, Bokmal: 0.10%, Croatian: 0.10%, French: 0.10%, German: 0.10%, Hungarian: 0.10%, Icelandic: 0.10%, Irish: 0.10%, Latvian: 0.10%, Maori: 0.10%, Sotho: 0.10%
 
 >> Detection of 1000 word pairs (average length: 16 chars)
-Accuracy: 87.90%
-Erroneously classified as Turkish: 7.80%, Basque: 0.50%, Swahili: 0.40%, Italian: 0.30%, Spanish: 0.30%, Albanian: 0.20%, Esperanto: 0.20%, Indonesian: 0.20%, Latin: 0.20%, Malay: 0.20%, Shona: 0.20%, Somali: 0.20%, Bosnian: 0.10%, Danish: 0.10%, Dutch: 0.10%, Estonian: 0.10%, Finnish: 0.10%, German: 0.10%, Latvian: 0.10%, Polish: 0.10%, Swedish: 0.10%, Tagalog: 0.10%, Tswana: 0.10%, Xhosa: 0.10%, Yoruba: 0.10%, Zulu: 0.10%
+Accuracy: 92.30%
+Erroneously classified as Turkish: 4.70%, Italian: 0.30%, Albanian: 0.20%, Basque: 0.20%, Esperanto: 0.20%, Indonesian: 0.20%, Latin: 0.20%, Shona: 0.20%, Swahili: 0.20%, Bosnian: 0.10%, Danish: 0.10%, Dutch: 0.10%, German: 0.10%, Latvian: 0.10%, Malay: 0.10%, Polish: 0.10%, Somali: 0.10%, Swedish: 0.10%, Tagalog: 0.10%, Tswana: 0.10%, Xhosa: 0.10%, Zulu: 0.10%
 
 >> Detection of 1000 sentences (average length: 107 chars)
-Accuracy: 98.70%
-Erroneously classified as Turkish: 1.00%, Afrikaans: 0.10%, Sotho: 0.10%, Tagalog: 0.10%
+Accuracy: 99.00%
+Erroneously classified as Turkish: 0.80%, Sotho: 0.10%, Tagalog: 0.10%
 
diff --git a/cmd/accuracy-reports/lingua/Basque.txt → ...y-reports/lingua-high-accuracy/Basque.txt b/cmd/accuracy-reports/lingua/Basque.txt → ...y-reports/lingua-high-accuracy/Basque.txt
diff --git a/cmd/accuracy-reports/lingua-high-accuracy/Belarusian.txt b/cmd/accuracy-reports/lingua-high-accuracy/Belarusian.txt
@@ -0,0 +1,16 @@
+##### Belarusian #####
+
+>>> Accuracy on average: 96.87%
+
+>> Detection of 1000 single words (average length: 8 chars)
+Accuracy: 91.50%
+Erroneously classified as Russian: 2.90%, Ukrainian: 2.10%, Serbian: 1.00%, Kazakh: 0.90%, Bulgarian: 0.60%, Macedonian: 0.60%, Mongolian: 0.40%
+
+>> Detection of 1000 word pairs (average length: 17 chars)
+Accuracy: 99.20%
+Erroneously classified as Russian: 0.50%, Bulgarian: 0.10%, Macedonian: 0.10%, Ukrainian: 0.10%
+
+>> Detection of 1000 sentences (average length: 105 chars)
+Accuracy: 99.90%
+Erroneously classified as Kazakh: 0.10%
+
diff --git a/cmd/accuracy-reports/lingua/Bengali.txt → ...-reports/lingua-high-accuracy/Bengali.txt b/cmd/accuracy-reports/lingua/Bengali.txt → ...-reports/lingua-high-accuracy/Bengali.txt
diff --git a/cmd/accuracy-reports/lingua/Bokmal.txt → ...y-reports/lingua-high-accuracy/Bokmal.txt b/cmd/accuracy-reports/lingua/Bokmal.txt → ...y-reports/lingua-high-accuracy/Bokmal.txt
@@ -12,5 +12,5 @@ Erroneously classified as Nynorsk: 23.70%, Danish: 12.70%, Swedish: 1.40%, Germa
 
 >> Detection of 1000 sentences (average length: 98 chars)
 Accuracy: 75.40%
-Erroneously classified as Nynorsk: 22.00%, Danish: 2.20%, Afrikaans: 0.10%, Dutch: 0.10%, English: 0.10%, Swedish: 0.10%
+Erroneously classified as Nynorsk: 22.10%, Danish: 2.20%, Dutch: 0.10%, English: 0.10%, Swedish: 0.10%
 
diff --git a/cmd/accuracy-reports/lingua/Bosnian.txt → ...-reports/lingua-high-accuracy/Bosnian.txt b/cmd/accuracy-reports/lingua/Bosnian.txt → ...-reports/lingua-high-accuracy/Bosnian.txt
diff --git a/cmd/accuracy-reports/lingua/Bulgarian.txt → ...eports/lingua-high-accuracy/Bulgarian.txt b/cmd/accuracy-reports/lingua/Bulgarian.txt → ...eports/lingua-high-accuracy/Bulgarian.txt
diff --git a/cmd/accuracy-reports/lingua/Catalan.txt → ...-reports/lingua-high-accuracy/Catalan.txt b/cmd/accuracy-reports/lingua/Catalan.txt → ...-reports/lingua-high-accuracy/Catalan.txt
@@ -1,16 +1,16 @@
 ##### Catalan #####
 
->>> Accuracy on average: 70.03%
+>>> Accuracy on average: 70.17%
 
 >> Detection of 1000 single words (average length: 8 chars)
 Accuracy: 50.60%
 Erroneously classified as Spanish: 7.80%, Portuguese: 6.20%, French: 5.70%, Italian: 3.50%, Latin: 3.50%, English: 2.70%, Romanian: 2.50%, Basque: 2.10%, Esperanto: 1.90%, Yoruba: 1.10%, Tswana: 1.00%, Shona: 0.80%, Dutch: 0.70%, Somali: 0.70%, Icelandic: 0.60%, Sotho: 0.60%, Swahili: 0.60%, Afrikaans: 0.50%, Indonesian: 0.50%, Malay: 0.50%, Swedish: 0.50%, Turkish: 0.50%, Albanian: 0.40%, German: 0.40%, Nynorsk: 0.40%, Bokmal: 0.30%, Croatian: 0.30%, Finnish: 0.30%, Zulu: 0.30%, Bosnian: 0.20%, Danish: 0.20%, Estonian: 0.20%, Hungarian: 0.20%, Polish: 0.20%, Tagalog: 0.20%, Tsonga: 0.20%, Vietnamese: 0.20%, Welsh: 0.20%, Xhosa: 0.20%, Irish: 0.10%, Latvian: 0.10%, Lithuanian: 0.10%, Maori: 0.10%, Slovene: 0.10%
 
 >> Detection of 1000 word pairs (average length: 16 chars)
-Accuracy: 73.80%
-Erroneously classified as Spanish: 8.90%, Portuguese: 3.70%, French: 2.90%, Yoruba: 2.20%, Italian: 1.90%, Latin: 1.90%, English: 1.60%, Romanian: 0.40%, Swahili: 0.30%, Dutch: 0.20%, Esperanto: 0.20%, Irish: 0.20%, Slovak: 0.20%, Tagalog: 0.20%, Welsh: 0.20%, Afrikaans: 0.10%, Albanian: 0.10%, Basque: 0.10%, Finnish: 0.10%, German: 0.10%, Hungarian: 0.10%, Lithuanian: 0.10%, Nynorsk: 0.10%, Sotho: 0.10%, Tsonga: 0.10%, Tswana: 0.10%, Vietnamese: 0.10%
+Accuracy: 73.90%
+Erroneously classified as Spanish: 8.90%, Portuguese: 3.70%, French: 2.90%, Yoruba: 2.20%, Italian: 1.90%, Latin: 1.90%, English: 1.60%, Romanian: 0.40%, Swahili: 0.30%, Basque: 0.20%, Dutch: 0.20%, Esperanto: 0.20%, Irish: 0.20%, Slovak: 0.20%, Tagalog: 0.20%, Welsh: 0.20%, Afrikaans: 0.10%, Albanian: 0.10%, Finnish: 0.10%, German: 0.10%, Hungarian: 0.10%, Lithuanian: 0.10%, Nynorsk: 0.10%, Tsonga: 0.10%, Vietnamese: 0.10%
 
 >> Detection of 1000 sentences (average length: 103 chars)
-Accuracy: 85.70%
-Erroneously classified as Spanish: 6.40%, English: 1.70%, Latin: 1.40%, French: 1.00%, Yoruba: 0.70%, Tagalog: 0.50%, Basque: 0.30%, Italian: 0.30%, Portuguese: 0.30%, Tsonga: 0.30%, Romanian: 0.20%, Swahili: 0.20%, Vietnamese: 0.20%, Danish: 0.10%, Esperanto: 0.10%, Finnish: 0.10%, German: 0.10%, Malay: 0.10%, Slovene: 0.10%, Tswana: 0.10%, Xhosa: 0.10%
+Accuracy: 86.00%
+Erroneously classified as Spanish: 6.60%, English: 1.80%, Latin: 1.40%, French: 1.00%, Yoruba: 0.60%, Portuguese: 0.40%, Basque: 0.30%, Italian: 0.30%, Romanian: 0.20%, Swahili: 0.20%, Tagalog: 0.20%, Vietnamese: 0.20%, Danish: 0.10%, Esperanto: 0.10%, Finnish: 0.10%, German: 0.10%, Malay: 0.10%, Slovene: 0.10%, Tswana: 0.10%, Xhosa: 0.10%
 
diff --git a/cmd/accuracy-reports/lingua/Chinese.txt → ...-reports/lingua-high-accuracy/Chinese.txt b/cmd/accuracy-reports/lingua/Chinese.txt → ...-reports/lingua-high-accuracy/Chinese.txt
diff --git a/cmd/accuracy-reports/lingua/Croatian.txt → ...reports/lingua-high-accuracy/Croatian.txt b/cmd/accuracy-reports/lingua/Croatian.txt → ...reports/lingua-high-accuracy/Croatian.txt
@@ -1,6 +1,6 @@
 ##### Croatian #####
 
->>> Accuracy on average: 72.33%
+>>> Accuracy on average: 72.40%
 
 >> Detection of 1000 single words (average length: 8 chars)
 Accuracy: 53.40%
@@ -11,6 +11,6 @@ Accuracy: 74.30%
 Erroneously classified as Bosnian: 19.00%, Slovene: 3.50%, Slovak: 0.70%, English: 0.50%, Basque: 0.20%, Latin: 0.20%, Lithuanian: 0.20%, Polish: 0.20%, Swahili: 0.20%, Turkish: 0.20%, Afrikaans: 0.10%, Albanian: 0.10%, Czech: 0.10%, Esperanto: 0.10%, Italian: 0.10%, Nynorsk: 0.10%, Portuguese: 0.10%, Romanian: 0.10%
 
 >> Detection of 1000 sentences (average length: 127 chars)
-Accuracy: 89.30%
-Erroneously classified as Bosnian: 10.20%, Latin: 0.10%, Shona: 0.10%, Somali: 0.10%, Swahili: 0.10%, Tsonga: 0.10%
+Accuracy: 89.50%
+Erroneously classified as Bosnian: 10.30%, Latin: 0.10%, Shona: 0.10%