@@ -47,21 +47,20 @@ object Ukrainian extends Language {
4747 " ts" -> 'ц' ,
4848 " zh" -> 'ж'
4949 )
50-
51- val biGramsIncremental = getIncrementalNgram(biGrams)
50+ val biGramsIncremental = incrementalNgram(biGrams)
5251
5352 val triGrams = Map (
5453 " zgh" -> 'г' ,
5554 )
56-
57- val triGramsIncremental = getIncrementalNgram(triGrams) + (" шцh" -> 'щ' )
55+ val triGramsIncremental = incrementalNgram(triGrams) ++ Map (
56+ " шцh" -> 'щ' ,
57+ " зґh" -> 'г'
58+ )
5859
5960 val fourGrams = Map (
6061 " shch" -> 'щ'
6162 )
6263
63- val fourGramIncremental = Map .empty[String , Char ]
64-
6564 val apostrophePatterns = Set (
6665 ('b' , " ya" ),
6766 ('b' , " ye" ),
@@ -90,10 +89,6 @@ object Ukrainian extends Language {
9089 ('z' , " yi" )
9190 )
9291
93- def getIncrementalNgram (ngram : Map [String , Char ]): Map [String , Char ] = ngram ++ ngram.map { case (prefix, value) =>
94- (latinToCyrillic(prefix.slice(0 , prefix.length - 1 ), incrementalTranslit = true ) + prefix.last, value)
95- }
96-
9792 /**
9893 * Converts one character starting from `offset`
9994 *
@@ -104,24 +99,25 @@ object Ukrainian extends Language {
10499 def latinToCyrillicOfs (text : String ,
105100 offset : Int ,
106101 apostrophes : Boolean = true ,
107- incrementalTranslit : Boolean = false ): (Int , Char ) = {
108- val (biGramsL, triGramsL, fourGramsL ) =
109- if (incrementalTranslit ) (biGramsIncremental, triGramsIncremental, fourGramIncremental )
110- else (biGrams, triGrams, fourGrams )
102+ incremental : Boolean = false ): (Int , Char ) = {
103+ val (biGramsL, triGramsL) =
104+ if (incremental ) (biGramsIncremental, triGramsIncremental)
105+ else (biGrams, triGrams)
111106 val ofs = offset + 1
112107 if (ofs >= 4 &&
113- fourGramsL .contains(text.substring(ofs - 4 , ofs).toLowerCase)
108+ fourGrams .contains(text.substring(ofs - 4 , ofs).toLowerCase)
114109 ) {
115110 val chars = text.substring(ofs - 4 , ofs)
116- val cyrillic = fourGramsL (chars.toLowerCase)
111+ val cyrillic = fourGrams (chars.toLowerCase)
117112 (- 2 , restoreCaseFirst(chars, cyrillic))
118113 } else if (ofs >= 3 &&
119114 triGramsL.contains(text.substring(ofs - 3 , ofs).toLowerCase)
120115 ) {
121116 val chars = text.substring(ofs - 3 , ofs)
122117 val cyrillic = triGramsL(chars.toLowerCase)
123- val newOffset = if (chars == " шцh" ) - 2 else - 1
124- (newOffset, restoreCaseAll(chars, cyrillic))
118+ if (incremental && chars.equalsIgnoreCase(" шцh" ))
119+ (- 2 , restoreCaseFirst(chars, cyrillic))
120+ else (- 1 , restoreCaseAll(chars, cyrillic))
125121 } else if (ofs >= 2 &&
126122 biGramsL.contains(text.substring(ofs - 2 , ofs).toLowerCase)
127123 ) {
0 commit comments