Skip to content

Commit 5c0000d

Browse files
committed
Add function to transliterate one character at a time
1 parent 636e21a commit 5c0000d

File tree

5 files changed

+156
-137
lines changed

5 files changed

+156
-137
lines changed

README.md

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ There have been several attempts to standardise transliteration rules. For examp
3434
* Ukrayins'kyy pravopys (BGN/PCGN 1965)
3535
* Ukrains'kyi pravopys (National 1996)
3636
* Ukrainskyi pravopys ([National 2010](http://zakon1.rada.gov.ua/laws/show/55-2010-%D0%BF))
37-
* Ukrayins'kyy pravopys (*translit-scala*)
37+
* Ukrayins'kyj pravopys (*translit-scala*)
3838

3939
Furthermore, there are language-specific transliterations, e.g. in German and French, that use the spelling conventions of the respective language (*sch* in German instead of *sh* in English).
4040

@@ -60,25 +60,15 @@ The Latin letter *y* is also the phonetic basis of four letters in the Slavic al
6060

6161
Unlike National 2010, we always use the same transliteration regardless of the position in the word.
6262

63-
The accented counterpart of и is й. It is only used in conjunction with vowels. This lets us define the following rules without mapping й onto a separate letter:
63+
The accented counterpart of и is й, which is represented by a separate letter, *j*.
6464

65-
* ay → ай
66-
* ey → ей
67-
* iy → ій
68-
* yy → ий
69-
* yo → йо
70-
71-
*Example:* Zghurskyy (Згурський)
72-
73-
Note that the four basic rules have a higher precedence over the й rules. This is needed for some words such as the following to be transliterated correctly:
74-
75-
* kofeyin (кофеїн instead of кофейін)
65+
*Example:* Zghurs'kyj (Згурський)
7666

7767
#### Soft Signs and Apostrophes
7868
The second change from National 2010 is that we try to restore soft signs and apostrophes:
7969

80-
* Ukrayins'kyy (Український)
81-
* malen'kyy (маленький)
70+
* Ukrayins'kyj (Український)
71+
* malen'kyj (маленький)
8272

8373
This feature is experimental and can be disabled by setting `apostrophes` to `false`.
8474

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
package translit
2+
3+
/** Case-restoration helpers shared by the transliteration tables. */
object Helpers {
  /**
   * Upper-cases `cyrillic` only when every character of `str` is upper case.
   *
   * @param str      the Latin characters that were matched
   * @param cyrillic the (lower-case) Cyrillic replacement
   * @return `cyrillic.toUpper` if `str` is non-empty and entirely upper case,
   *         otherwise `cyrillic` unchanged
   */
  def restoreCaseAll(str: String, cyrillic: Char): Char =
    // `forall` is vacuously true for "", so emptiness is checked explicitly
    if (str.nonEmpty && str.forall(_.isUpper)) cyrillic.toUpper else cyrillic

  /**
   * Upper-cases `cyrillic` when the first character of `str` is upper case.
   *
   * @param str      the Latin characters that were matched
   * @param cyrillic the (lower-case) Cyrillic replacement
   * @return `cyrillic.toUpper` if `str` starts with an upper-case character,
   *         otherwise `cyrillic` unchanged (including when `str` is empty)
   */
  def restoreCaseFirst(str: String, cyrillic: Char): Char =
    // `headOption` instead of `str(0)` so that "" does not throw
    if (str.headOption.exists(_.isUpper)) cyrillic.toUpper else cyrillic
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package translit
2+
3+
/**
 * A transliteration scheme that turns Latin text into Cyrillic by processing
 * the input one character at a time.
 */
trait Language {
  /**
   * Transliterates the character of `text` located at `offset`.
   *
   * @return a pair `(length, c)`. When `length` is negative,
   *         `latinToCyrillic` removes `-length` characters from the end of
   *         its output before appending `c`; otherwise `c` is just appended.
   */
  def latinToCyrillicOfs(text: String,
                         offset: Int,
                         apostrophes: Boolean = true): (Int, Char)

  /**
   * Transliterates all of `text` by calling `latinToCyrillicOfs` for every
   * index in order and applying each returned instruction to the output.
   *
   * @param text        the Latin input
   * @param apostrophes forwarded unchanged to `latinToCyrillicOfs`
   * @return the accumulated output
   */
  def latinToCyrillic(text: String, apostrophes: Boolean = true): String = {
    val out = new StringBuilder(text.length)

    text.indices.foreach { position =>
      latinToCyrillicOfs(text, position, apostrophes) match {
        case (delta, ch) =>
          // A negative delta retracts characters that were emitted earlier
          if (delta < 0) out.setLength(out.length + delta)
          out.append(ch)
      }
    }

    out.toString
  }
}
Lines changed: 62 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package translit
22

3-
object Ukrainian {
3+
import Helpers._
4+
5+
object Ukrainian extends Language {
46
val uniGrams = Map(
57
'a' -> 'а',
68
'b' -> 'б',
@@ -22,54 +24,29 @@ object Ukrainian {
2224
'u' -> 'у',
2325
'v' -> 'в',
2426
'y' -> 'и',
27+
'j' -> 'й',
2528
'z' -> 'з'
2629
)
2730

2831
val biGrams = Map(
29-
"ya" -> "я",
30-
"ye" -> "є",
31-
"yi" -> "ї",
32-
"yu" -> "ю",
33-
34-
"ay" -> "ай",
35-
"ey" -> "ей",
36-
"iy" -> "ій",
37-
"yy" -> "ий",
38-
"yo" -> "йо",
39-
40-
"ch" -> "ч",
41-
"kh" -> "х",
42-
"sh" -> "ш",
43-
"ts" -> "ц",
44-
"zh" -> "ж"
32+
"ya" -> 'я',
33+
"ye" -> 'є',
34+
"yi" -> 'ї',
35+
"yu" -> 'ю',
36+
37+
"ch" -> 'ч',
38+
"kh" -> 'х',
39+
"sh" -> 'ш',
40+
"ts" -> 'ц',
41+
"zh" -> 'ж'
4542
)
4643

4744
val triGrams = Map(
48-
"aya" -> "ая",
49-
"aye" -> "ає",
50-
"ayi" -> "аї",
51-
"ayu" -> "аю",
52-
53-
"eya" -> "ея",
54-
"eye" -> "еє",
55-
"eyi" -> "еї",
56-
"eyu" -> "ею",
57-
58-
"iya" -> "ія",
59-
"iye" -> "іє",
60-
"iyi" -> "ії",
61-
"iyu" -> "ію",
62-
63-
"yya" -> "ия",
64-
"yye" -> "иє",
65-
"yyi" -> "иї",
66-
"yyu" -> "ию",
67-
68-
"zgh" -> "зг"
45+
"zgh" -> 'г'
6946
)
7047

7148
val fourGrams = Map(
72-
"shch" -> "щ"
49+
"shch" -> 'щ'
7350
)
7451

7552
val apostrophePatterns = Set(
@@ -100,56 +77,52 @@ object Ukrainian {
10077
('z', "yi")
10178
)
10279

103-
def restoreCase(str: String, cyrillic: String): String =
104-
if (str.forall(_.isUpper)) cyrillic.toUpperCase
105-
else if (str(0).isUpper) cyrillic.capitalize
106-
else cyrillic
107-
108-
def latinToCyrillic(text: String, apostrophes: Boolean = true): String = {
109-
val result = new StringBuilder(text.length)
110-
111-
var i = 0
112-
while (i < text.length) {
113-
if (i + 4 <= text.length && fourGrams.contains(text.substring(i, i + 4).toLowerCase)) {
114-
val cyrillic = fourGrams(text.substring(i, i + 4).toLowerCase)
115-
result.append(restoreCase(text.substring(i, i + 4), cyrillic))
116-
i += 4
117-
} else if (i + 3 <= text.length && triGrams.contains(text.substring(i, i + 3).toLowerCase)) {
118-
val cyrillic = triGrams(text.substring(i, i + 3).toLowerCase)
119-
result.append(restoreCase(text.substring(i, i + 3), cyrillic))
120-
i += 3
121-
} else if (i + 2 <= text.length && biGrams.contains(text.substring(i, i + 2).toLowerCase)) {
122-
val cyrillic = biGrams(text.substring(i, i + 2).toLowerCase)
123-
result.append(restoreCase(text.substring(i, i + 2), cyrillic))
124-
i += 2
125-
} else if ('c' == text(i).toLower) {
126-
// Skip Latin `c` to avoid confusion as its Cyrillic counterpart has a
127-
// different byte code
128-
i += 1
129-
} else if (uniGrams.contains(text(i).toLower)) {
130-
val cyrillic = uniGrams(text(i).toLower)
131-
result.append(if (text(i).isUpper) cyrillic.toUpper else cyrillic)
132-
i += 1
133-
} else if (text(i) == '\'') {
134-
if (apostrophes) {
135-
val last = if (i >= 1) text(i - 1).toLower else '\u0000'
136-
val nextTwo = text.slice(i + 1, i + 3).toLowerCase
137-
val cyrillic =
138-
if (apostrophePatterns.contains((last, nextTwo))) '\'' else 'ь'
139-
140-
result.append(
141-
if (i > 0 && text(i - 1).isUpper &&
142-
!(i == 1 || (i > 1 && text(i - 2).isWhitespace))
143-
) cyrillic.toUpper else cyrillic)
144-
}
145-
146-
i += 1
147-
} else {
148-
result.append(text(i))
149-
i += 1
150-
}
80+
/**
 * Converts the Latin character at `offset`, looking back at the characters
 * before it so that multi-letter sequences collapse into one Cyrillic letter.
 *
 * The first element of the result tells the caller how to apply the second:
 *
 *  - `-2`: replace the last two output characters by `c`
 *  - `-1`: replace the last output character by `c`
 *  - ` 0`: append `c`
 *
 * @param text        the Latin input
 * @param offset      index in `text` of the character to convert
 * @param apostrophes when `false`, the apostrophe branch is skipped and an
 *                    apostrophe is handled like any other character
 * @return the pair `(length, c)` described above
 */
def latinToCyrillicOfs(text: String,
                       offset: Int,
                       apostrophes: Boolean = true): (Int, Char) = {
  val ofs = offset + 1
  val current = text(offset)

  // Looks up the `n` characters ending at `offset` (if that many exist) and
  // returns them together with their Cyrillic replacement.
  def lookup(n: Int, table: Map[String, Char]): Option[(String, Char)] =
    if (ofs < n) None
    else {
      val chars = text.substring(ofs - n, ofs)
      table.get(chars.toLowerCase).map(cyrillic => (chars, cyrillic))
    }

  // Longest match wins. The replacement counts reflect how many output
  // characters the shorter prefixes of the n-gram have already produced.
  val nGram: Option[(Int, Char)] =
    lookup(4, fourGrams)
      .map { case (chars, cyrillic) => (-2, restoreCaseFirst(chars, cyrillic)) }
      .orElse(lookup(3, triGrams).map { case (chars, cyrillic) =>
        (-1, restoreCaseAll(chars, cyrillic))
      })
      .orElse(lookup(2, biGrams).map { case (chars, cyrillic) =>
        (-1, restoreCaseFirst(chars, cyrillic))
      })

  nGram.getOrElse {
    uniGrams.get(current.toLower) match {
      case Some(cyrillic) =>
        (0, if (current.isUpper) cyrillic.toUpper else cyrillic)

      case None if ofs >= 2 && current == '\'' && apostrophes =>
        val previous = text(ofs - 2)
        val nextTwo = text.slice(ofs, ofs + 2).toLowerCase
        val cyrillic =
          if (apostrophePatterns.contains((previous.toLower, nextTwo))) '\''
          else 'ь'

        // A capital letter that merely starts a word (e.g. `L'viv`) must not
        // upper-case the soft sign; only a capital inside a word does.
        val previousStartsWord = ofs == 2 || text(ofs - 3).isWhitespace
        (0, if (previous.isUpper && !previousStartsWord) cyrillic.toUpper
            else cyrillic)

      case None if current.toLower == 'c' =>
        // A lone Latin `c` is replaced by a placeholder to avoid confusion
        // with the look-alike Cyrillic letter, which has a different code
        // point. A following `h` overwrites it through the `ch` bigram.
        (0, 'ø')

      case None =>
        (0, current)
    }
  }
}
155128
}

0 commit comments

Comments
 (0)