Skip to content

Commit 199d73a

Browse files
committed
Improve incremental transliteration
1 parent a658b9f commit 199d73a

File tree

6 files changed

+51
-48
lines changed

6 files changed

+51
-48
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,3 +151,4 @@ translit-scala is licensed under the terms of the Apache v2.0 licence.
151151

152152
## Contributors
153153
* Tim Nieradzik
154+
* Darkhan Kubigenov

shared/src/main/scala/translit/Language.scala

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,21 +4,26 @@ trait Language {
44
def latinToCyrillicOfs(text: String,
55
offset: Int,
66
apostrophes: Boolean = true,
7-
incrementalTranslit: Boolean = false): (Int, Char)
7+
incremental: Boolean = false): (Int, Char)
88

99
def latinToCyrillic(text: String,
1010
apostrophes: Boolean = true,
11-
incrementalTranslit: Boolean = false): String = {
11+
incremental: Boolean = false): String = {
1212
val result = new StringBuilder(text.length)
1313
var offset = 0
1414

1515
while (offset < text.length) {
16-
val (length, c) = latinToCyrillicOfs(text, offset, apostrophes, incrementalTranslit)
16+
val (length, c) = latinToCyrillicOfs(text, offset, apostrophes, incremental)
1717
if (length < 0) result.setLength(result.length + length)
1818
result.append(c)
1919
offset += 1
2020
}
2121

2222
result.mkString
2323
}
24+
25+
def incrementalNgram(ngram: Map[String, Char]): Map[String, Char] =
26+
ngram.map { case (prefix, value) =>
27+
(latinToCyrillic(prefix.init) + prefix.last, value)
28+
}
2429
}

shared/src/main/scala/translit/Russian.scala

Lines changed: 8 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@ package translit
33
import translit.Helpers._
44

55
object Russian extends Language {
6-
76
val uniGrams = Map(
87
'a' -> 'а',
98
'b' -> 'б',
@@ -43,8 +42,7 @@ object Russian extends Language {
4342
"yo" -> 'ё',
4443
"yu" -> 'ю'
4544
)
46-
47-
val biGramsIncremental = getIncrementalNgram(biGrams)
45+
val biGramsIncremental = incrementalNgram(biGrams)
4846

4947
val triGrams = Map.empty[String, Char]
5048
val triGramsIncremental = Map(
@@ -54,11 +52,6 @@ object Russian extends Language {
5452
val fourGrams = Map(
5553
"shch" -> 'щ'
5654
)
57-
val fourGramsIncremental = Map.empty[String, Char]
58-
59-
def getIncrementalNgram(ngram: Map[String, Char]): Map[String, Char] = ngram ++ ngram.map { case (prefix, value) =>
60-
(latinToCyrillic(prefix.slice(0, prefix.length - 1), incrementalTranslit = true) + prefix.last, value)
61-
}
6255

6356
/**
6457
* Converts one character starting from `offset`
@@ -70,15 +63,16 @@ object Russian extends Language {
7063
def latinToCyrillicOfs(text: String,
7164
offset: Int,
7265
apostrophes: Boolean = true,
73-
incrementalTranslit: Boolean = false): (Int, Char) = {
74-
val (biGramsL, triGramsL, fourGramsL) =
75-
if (incrementalTranslit) (biGramsIncremental, triGramsIncremental, fourGramsIncremental)
76-
else (biGrams, triGrams, fourGrams)
66+
incremental: Boolean = false): (Int, Char) = {
67+
val (biGramsL, triGramsL) =
68+
if (incremental) (biGramsIncremental, triGramsIncremental)
69+
else (biGrams, triGrams)
70+
7771
val ofs = offset + 1
7872
if (ofs >= 4 &&
79-
fourGramsL.contains(text.substring(ofs - 4, ofs).toLowerCase)) {
73+
fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase)) {
8074
val chars = text.substring(ofs - 4, ofs)
81-
val cyrillic = fourGramsL(chars.toLowerCase)
75+
val cyrillic = fourGrams(chars.toLowerCase)
8276
(-2, restoreCaseFirst(chars, cyrillic))
8377
} else if (ofs >= 3 &&
8478
triGramsL.contains(text.substring(ofs - 3, ofs).toLowerCase)) {

shared/src/main/scala/translit/Ukrainian.scala

Lines changed: 14 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -47,21 +47,20 @@ object Ukrainian extends Language {
4747
"ts" -> 'ц',
4848
"zh" -> 'ж'
4949
)
50-
51-
val biGramsIncremental = getIncrementalNgram(biGrams)
50+
val biGramsIncremental = incrementalNgram(biGrams)
5251

5352
val triGrams = Map(
5453
"zgh" -> 'г',
5554
)
56-
57-
val triGramsIncremental = getIncrementalNgram(triGrams) + ("шцh" -> 'щ')
55+
val triGramsIncremental = incrementalNgram(triGrams) ++ Map(
56+
"шцh" -> 'щ',
57+
"зґh" -> 'г'
58+
)
5859

5960
val fourGrams = Map(
6061
"shch" -> 'щ'
6162
)
6263

63-
val fourGramIncremental = Map.empty[String, Char]
64-
6564
val apostrophePatterns = Set(
6665
('b', "ya"),
6766
('b', "ye"),
@@ -90,10 +89,6 @@ object Ukrainian extends Language {
9089
('z', "yi")
9190
)
9291

93-
def getIncrementalNgram(ngram: Map[String, Char]): Map[String, Char] = ngram ++ ngram.map { case (prefix, value) =>
94-
(latinToCyrillic(prefix.slice(0, prefix.length - 1), incrementalTranslit = true) + prefix.last, value)
95-
}
96-
9792
/**
9893
* Converts one character starting from `offset`
9994
*
@@ -104,24 +99,25 @@ object Ukrainian extends Language {
10499
def latinToCyrillicOfs(text: String,
105100
offset: Int,
106101
apostrophes: Boolean = true,
107-
incrementalTranslit: Boolean = false): (Int, Char) = {
108-
val (biGramsL, triGramsL, fourGramsL) =
109-
if (incrementalTranslit) (biGramsIncremental, triGramsIncremental, fourGramIncremental)
110-
else (biGrams, triGrams, fourGrams)
102+
incremental: Boolean = false): (Int, Char) = {
103+
val (biGramsL, triGramsL) =
104+
if (incremental) (biGramsIncremental, triGramsIncremental)
105+
else (biGrams, triGrams)
111106
val ofs = offset + 1
112107
if (ofs >= 4 &&
113-
fourGramsL.contains(text.substring(ofs - 4, ofs).toLowerCase)
108+
fourGrams.contains(text.substring(ofs - 4, ofs).toLowerCase)
114109
) {
115110
val chars = text.substring(ofs - 4, ofs)
116-
val cyrillic = fourGramsL(chars.toLowerCase)
111+
val cyrillic = fourGrams(chars.toLowerCase)
117112
(-2, restoreCaseFirst(chars, cyrillic))
118113
} else if (ofs >= 3 &&
119114
triGramsL.contains(text.substring(ofs - 3, ofs).toLowerCase)
120115
) {
121116
val chars = text.substring(ofs - 3, ofs)
122117
val cyrillic = triGramsL(chars.toLowerCase)
123-
val newOffset = if (chars == "шцh") -2 else -1
124-
(newOffset, restoreCaseAll(chars, cyrillic))
118+
if (incremental && chars.equalsIgnoreCase("шцh"))
119+
(-2, restoreCaseFirst(chars, cyrillic))
120+
else (-1, restoreCaseAll(chars, cyrillic))
125121
} else if (ofs >= 2 &&
126122
biGramsL.contains(text.substring(ofs - 2, ofs).toLowerCase)
127123
) {

shared/src/test/scala/translit/RussianSpec.scala

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -37,20 +37,20 @@ class RussianSpec extends FunSuite {
3737
"ягненок" -> "yagnenok",
3838
)
3939

40-
correctMapping.foreach {
41-
case (cyrillic, latin) =>
42-
test(s"$latin -> $cyrillic") {
43-
assert(
44-
Russian.latinToCyrillic(latin) ==
45-
cyrillic)
46-
}
40+
correctMapping.foreach { case (cyrillic, latin) =>
41+
test(s"$latin -> $cyrillic") {
42+
assert(Russian.latinToCyrillic(latin) == cyrillic)
43+
}
4744
}
4845

49-
test("Incremental translit") {
50-
assert(Russian.latinToCyrillic("peсhkom", incrementalTranslit = true) == "пешком")
51-
assert(Russian.latinToCyrillic("зhizn'", incrementalTranslit = true) == "жизнь")
52-
assert(Russian.latinToCyrillic("zhizn'", incrementalTranslit = true) == "жизнь")
53-
assert(Russian.latinToCyrillic("багазh", incrementalTranslit = true) == "багаж")
54-
assert(Russian.latinToCyrillic("шцhetka", incrementalTranslit = true) == "щетка")
46+
test("Incremental transliteration") {
47+
assert(Russian.latinToCyrillic("зh" , incremental = true) == "ж")
48+
assert(Russian.latinToCyrillic("шцh" , incremental = true) == "щ")
49+
assert(Russian.latinToCyrillic("Шцh" , incremental = true) == "Щ")
50+
assert(Russian.latinToCyrillic("багазh", incremental = true) == "багаж")
51+
52+
assert(Russian.latinToCyrillic("peshkom" ) == "пешком")
53+
assert(Russian.latinToCyrillic("zhizn'" ) == "жизнь")
54+
assert(Russian.latinToCyrillic("shchetka") == "щетка")
5555
}
5656
}

shared/src/test/scala/translit/UkrainianSpec.scala

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -278,4 +278,11 @@ class UkrainianSpec extends FunSuite {
278278
assert(Ukrainian.latinToCyrillic("Puzata Xata") == "Пузата Хата")
279279
assert(Ukrainian.latinToCyrillic("cqwx") == "цщшх")
280280
}
281+
282+
test("Incremental transliteration") {
283+
assert(Ukrainian.latinToCyrillic("zgh") == "зг")
284+
assert(Ukrainian.latinToCyrillic("зґh", incremental = true) == "зг")
285+
286+
assert(Ukrainian.latinToCyrillic("Шцh", incremental = true) == "Щ")
287+
}
281288
}

0 commit comments

Comments
 (0)