Skip to content

Commit eff67f0

Browse files
committed
Add Russian language support
1 parent 21a5aff commit eff67f0

File tree

2 files changed

+169
-0
lines changed

2 files changed

+169
-0
lines changed
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
package translit
2+
3+
import translit.Helpers._
4+
5+
/**
  * Latin-to-Cyrillic transliteration tables and rules for Russian.
  *
  * Latin input is matched longest-first: tri-grams, then bi-grams, then
  * single characters. An apostrophe that follows one of the consonants in
  * `apostrophePrefix` is rendered as the soft sign.
  */
object Russian extends Language {
  /** Single Latin characters (lower case) and their Cyrillic counterparts.
    * Both 'x' and 'h' produce 'х'; '#' produces the hard sign.
    */
  val uniGrams: Map[Char, Char] = Map(
    'a' -> 'а',
    'b' -> 'б',
    'v' -> 'в',
    'g' -> 'г',
    'd' -> 'д',
    'e' -> 'е',
    'z' -> 'з',
    'i' -> 'и',
    'j' -> 'й',
    'k' -> 'к',
    'l' -> 'л',
    'm' -> 'м',
    'n' -> 'н',
    'o' -> 'о',
    'p' -> 'п',
    'r' -> 'р',
    's' -> 'с',
    't' -> 'т',
    'u' -> 'у',
    'f' -> 'ф',
    'x' -> 'х',
    'h' -> 'х',
    'c' -> 'ц',
    'w' -> 'щ',
    '#' -> 'ъ',
    'y' -> 'ы'
  )

  /** Two-letter Latin sequences (lower case) that collapse into one Cyrillic letter. */
  val biGrams: Map[String, Char] = Map(
    "jo" -> 'ё',
    "yo" -> 'ё',
    "zh" -> 'ж',
    "ch" -> 'ч',
    "sh" -> 'ш',
    "ye" -> 'э',
    "yu" -> 'ю',
    "ju" -> 'ю',
    "ya" -> 'я',
    "ja" -> 'я'
  )

  /** Three-letter Latin sequences (lower case) that collapse into one Cyrillic letter. */
  val triGrams: Map[String, Char] = Map(
    "shh" -> 'щ'
  )

  // Suffix rules were tried for the apostrophe, but Russian has many
  // exceptions (e.g. фольклор, пальцем), so `latinToCyrillicOfs` in this
  // object does not consult this set.
  // Each iotated vowel is listed in both its `y` and `j` spelling, matching
  // the spellings accepted by `biGrams`.
  val apostropheSuffix: Set[String] = Set(
    "ya",
    "ja",
    "yo",
    "jo",
    "i",
    "e",
    "yu",
    "ju",
    ""
  )

  /** Latin consonants (lower case) after which an apostrophe becomes a soft sign. */
  val apostrophePrefix: Set[String] = Set(
    "b",
    "v",
    "d",
    "z",
    "k",
    "l",
    "m",
    "n",
    "p",
    "r",
    "c",
    "t",
    "sh"
  )

  /**
    * Decides whether the apostrophe that ends at `end` (exclusive) should be
    * rendered as a soft sign.
    *
    * The one or two characters preceding the apostrophe are compared
    * case-insensitively against `apostrophePrefix`, so upper-case input such
    * as "L'" is recognised as well. The soft sign takes the case of the
    * character directly before the apostrophe.
    *
    * @return the soft sign in the appropriate case, or `None` if the
    *         apostrophe is not preceded by a known consonant
    */
  private def softSign(text: String, end: Int): Option[Char] = {
    // `slice` clamps out-of-range indices, so short inputs are safe here.
    val twoBefore = text.slice(end - 3, end - 1).toLowerCase
    val oneBefore = text.slice(end - 2, end - 1).toLowerCase
    if (apostrophePrefix.contains(twoBefore) || apostrophePrefix.contains(oneBefore))
      // A match implies at least one character precedes the apostrophe.
      Some(if (text(end - 2).isUpper) 'Ь' else 'ь')
    else None
  }

  /**
    * Converts one character starting from `offset`
    *
    * The n-gram ending at `offset` is looked up longest-first. Because the
    * shorter prefix of a bi-/tri-gram has already been emitted as a single
    * Cyrillic character by the previous call, a match replaces that one
    * character.
    *
    * @param text        Latin input
    * @param offset      index of the character in `text` to convert
    * @param apostrophes whether an apostrophe may be turned into a soft sign
    * @return (-2, c) Replace last two characters by `c`
    *         (-1, c) Replace last character by `c`
    *         ( 0, c) Append character `c`
    *         The Russian rules only ever yield -1 or 0.
    */
  def latinToCyrillicOfs(text: String,
                         offset: Int,
                         apostrophes: Boolean = true): (Int, Char) = {
    val end = offset + 1
    val current = text(offset)
    val lastThree = if (end >= 3) text.substring(end - 3, end) else ""
    val lastTwo = if (end >= 2) text.substring(end - 2, end) else ""

    triGrams
      .get(lastThree.toLowerCase)
      .map(cyrillic => (-1, restoreCaseAll(lastThree, cyrillic)))
      .orElse(
        biGrams
          .get(lastTwo.toLowerCase)
          .map(cyrillic => (-1, restoreCaseFirst(lastTwo, cyrillic))))
      .orElse(
        uniGrams
          .get(current.toLower)
          .map(cyrillic => (0, if (current.isUpper) cyrillic.toUpper else cyrillic)))
      .orElse(
        if (current == '\'' && apostrophes) softSign(text, end).map(sign => (0, sign))
        else None)
      // Anything unknown is passed through unchanged.
      .getOrElse((0, current))
  }
}
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
package translit
2+
3+
import org.scalatest.FunSuite
4+
5+
/**
  * Checks that `Russian.latinToCyrillic` turns each Latin spelling below
  * into the expected Cyrillic word. One test is registered per pair.
  */
class RussianSpec extends FunSuite {
  /** Expected Cyrillic word paired with the Latin input that must produce it. */
  val correctMapping = List(
    ("Андрей", "Andrej"),
    ("Борис", "Boris"),
    ("Валера", "Valera"),
    ("гвоздь", "gvozd'"),
    ("днище", "dnishhe"),
    ("Емеля", "Emelya"),
    ("ёлка", "yolka"),
    ("ёлка", "jolka"),
    ("железо", "zhelezo"),
    ("зыбь", "zyb'"),
    ("Ильин", "Il'in"),
    ("Йемен", "Jemen"),
    ("киянка", "kiyanka"),
    ("лещ", "leshh"),
    ("мышьяк", "mysh'yak"),
    ("Новгород", "Novgorod"),
    ("овраг", "ovrag"),
    ("пьянство", "p'yanstvo"),
    ("роща", "roshha"),
    ("съел", "s#el"),
    ("тележка", "telezhka"),
    ("ухват", "uxvat"),
    ("ухват", "uhvat"),
    ("фольклор", "fol'klor"),
    ("халтура", "haltura"),
    ("цвет", "cvet"),
    ("червь", "cherv'"),
    ("швея", "shveya"),
    ("щавель", "shhavel'"),
    ("электровоз", "yelektrovoz"),
    ("юла", "yula"),
    ("ягненок", "yagnenok")
  )

  /** Strips every soft sign and every apostrophe from `str`. */
  def removeApostropheAndSoftSign(str: String): String =
    Seq("ь", "'").foldLeft(str)((acc, pattern) => acc.replaceAll(pattern, ""))

  // Register one test per fixture; the test name shows the conversion direction.
  for ((cyrillic, latin) <- correctMapping) {
    test(s"$latin -> $cyrillic") {
      val converted = Russian.latinToCyrillic(latin)
      assert(converted == cyrillic)
    }
  }
}

0 commit comments

Comments
 (0)