Skip to content

Commit ad272d8

Browse files
committed
Improved fallback mechanism of FastDoubleParser to take into account all other locales, not just ROOT. Finished parse tests
1 parent 6594189 commit ad272d8

File tree

4 files changed

+216
-63
lines changed

4 files changed

+216
-63
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalDateTime
3636
import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime
3737
import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl
3838
import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
39-
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
4039
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
40+
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
4141
import java.math.BigDecimal
4242
import java.math.BigInteger
4343
import java.net.URL

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
3131
import org.jetbrains.kotlinx.dataframe.columns.size
3232
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
3333
import org.jetbrains.kotlinx.dataframe.hasNulls
34-
import org.jetbrains.kotlinx.dataframe.impl.asNullable
3534
import org.jetbrains.kotlinx.dataframe.impl.canParse
3635
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
3736
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt

Lines changed: 94 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols
55
import io.github.oshai.kotlinlogging.KotlinLogging
66
import org.jetbrains.kotlinx.dataframe.DataFrame
77
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
8-
import org.jetbrains.kotlinx.dataframe.api.parser
98
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
109
import java.nio.charset.Charset
1110
import java.text.DecimalFormatSymbols
@@ -15,19 +14,24 @@ import java.util.Locale
1514

1615
private val logger = KotlinLogging.logger {}
1716

18-
// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales
19-
private val INFINITIES = arrayOf("∞", "inf", "infinity", "infty")
20-
private val PLUS_INFINITIES = INFINITIES.map { "+$it" }
21-
private val MINUS_INFINITIES = INFINITIES.map { "-$it" }
22-
private val NANS = arrayOf("nan", "na", "n/a")
23-
2417
/**
2518
* Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double].
2619
*
2720
* If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the
2821
* fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser).
2922
* If not, or if it fails, it will use [NumberFormat] to parse the input.
3023
*
24+
* The [locale][locale] used by the double parser is defined like:
25+
*
26+
* [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` ?: `[Locale.getDefault()][Locale.getDefault]
27+
*
28+
* [FastDoubleParser] has a fallback mechanism; in practice, this means it can recognize symbols and notations
29+
* of any locale recognized by Java as long as that symbol does not conflict with the given locale.
30+
*
31+
* For example, if your locale uses ',' as decimal symbol, it will NOT recognize ',' as thousands separator, but it will
32+
* recognize ' ', '٬', '_', ' ', etc. as thousands separator.
33+
* The same holds for characters like "e", "inf", "x10", "NaN", etc.
34+
*
3135
* Public, so it can be used in other modules.
3236
*
3337
* @param parserOptions can be supplied to configure the parser.
@@ -41,10 +45,8 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
4145

4246
private val useFastDoubleParser = parserOptions?.useFastDoubleParser ?: Parsers.useFastDoubleParser
4347
private val locale = parserOptions?.locale ?: Parsers.locale
44-
private val fallbackLocale = Locale.ROOT
4548

4649
private val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale)
47-
private val fallbackDecimalFormatSymbols = DecimalFormatSymbols.getInstance(fallbackLocale)
4850

4951
private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true)
5052

@@ -75,71 +77,69 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
7577
}
7678

7779
/**
78-
* Builds a set with the specified char from [localDecimalFormatSymbols] and
79-
* its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so.
80-
* [additionals] will be added to the set too, when they're safe to add.
80+
* Builds a set containing the receiver char ([this], lowercased).
81+
* Each of the [fallbackChars] is added to the set too, when it is safe to add.
8182
*/
82-
fun ((DecimalFormatSymbols) -> Char).fromLocalWithFallBack(vararg additionals: Char): Set<Char> =
83+
fun Char.withFallback(fallbackChars: CharArray): Set<Char> =
8384
buildSet {
84-
val getChar = this@fromLocalWithFallBack
85-
val char = getChar(localDecimalFormatSymbols).lowercaseChar()
85+
val char = this@withFallback.lowercaseChar()
8686
add(char)
8787

88-
// add fallback char if it's safe to do so
89-
val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar()
90-
if (fallbackChar !in localChars && !localStrings.any { fallbackChar in it }) {
91-
add(fallbackChar)
92-
}
93-
94-
// Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
95-
if (char.isWhitespace()) add(' ')
88+
// Treat NBSP and other whitespace characters the same.
89+
if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable())
9690

97-
// add additional chars if needed
98-
for (additional in additionals) {
99-
val lowercase = additional.lowercaseChar()
91+
// add fallback chars if needed
92+
for (char in fallbackChars) {
93+
val lowercase = char.lowercaseChar()
10094
if (lowercase !in localChars && !localStrings.any { lowercase in it }) {
10195
add(lowercase)
10296
}
97+
98+
// Treat NBSP and other whitespace characters the same.
99+
if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable())
103100
}
104101
}
105102

106103
/**
107-
* Builds a set with the specified string from [localDecimalFormatSymbols] and
108-
* its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so.
109-
* [additionals] will be added to the set too, when they're safe to add.
104+
* Builds a set containing the receiver string ([this], lowercased).
105+
* Each of the [fallbackStrings] is added to the set too, when it is safe to add.
110106
*/
111-
fun ((DecimalFormatSymbols) -> String).fromLocalWithFallBack(vararg additionals: String): Set<String> =
107+
fun String.withFallback(fallbackStrings: Array<String>): Set<String> =
112108
buildSet {
113-
val getString = this@fromLocalWithFallBack
114-
val string = getString(localDecimalFormatSymbols).lowercase()
109+
val string = this@withFallback.lowercase()
115110
add(string)
116111

117-
// add fallback string if it's safe to do so
118-
val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase()
119-
if (!fallbackString.any { it in localChars } && fallbackString !in localStrings) {
120-
add(fallbackString)
121-
}
122-
123-
// Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
124-
if (string.isBlank()) add(" ")
112+
// Treat NBSP and other whitespace characters the same.
113+
if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() })
125114

126-
// add additional strings if needed
127-
for (additional in additionals) {
128-
val lowercase = additional.lowercase()
115+
// add fallback strings if needed
116+
for (string in fallbackStrings) {
117+
val lowercase = string.lowercase()
129118
if (!lowercase.any { it in localChars } && lowercase !in localStrings) {
130119
add(lowercase)
131120
}
121+
122+
// Treat NBSP and other whitespace characters the same.
123+
if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() })
132124
}
133125
}
134126

135127
return NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols)
136-
.withPlusSign(setOf('+'))
137-
.withDecimalSeparator(DecimalFormatSymbols::getDecimalSeparator.fromLocalWithFallBack())
138-
.withGroupingSeparator(DecimalFormatSymbols::getGroupingSeparator.fromLocalWithFallBack())
139-
.withExponentSeparator(DecimalFormatSymbols::getExponentSeparator.fromLocalWithFallBack())
140-
.withMinusSign(DecimalFormatSymbols::getMinusSign.fromLocalWithFallBack())
141-
.withInfinity(DecimalFormatSymbols::getInfinity.fromLocalWithFallBack(*INFINITIES))
142-
.withNaN(DecimalFormatSymbols::getNaN.fromLocalWithFallBack(*NANS))
128+
.withPlusSign(
129+
setOf('+'),
130+
).withDecimalSeparator(
131+
localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS),
132+
).withGroupingSeparator(
133+
localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS),
134+
).withExponentSeparator(
135+
localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS),
136+
).withMinusSign(
137+
localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS),
138+
).withInfinity(
139+
localDecimalFormatSymbols.infinity.withFallback(INFINITIES),
140+
).withNaN(
141+
localDecimalFormatSymbols.naN.withFallback(NANS),
142+
)
143143
}
144144

145145
/** Fallback method for parsing doubles. */
@@ -152,7 +152,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
152152
in NANS -> Double.NaN
153153

154154
else -> {
155-
// not thread safe; must be created here
155+
// NumberFormat is not thread safe; must be created in the function body
156156
val numberFormat = NumberFormat.getInstance(locale)
157157
val parsePosition = ParsePosition(0)
158158
val result = numberFormat.parse(this, parsePosition)?.toDouble()
@@ -235,4 +235,47 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
235235
}
236236
return String(chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback()
237237
}
238+
239+
/**
240+
* Here we store all possible decimal format symbols of all locales on the system.
241+
* These will be used as fallbacks for the selected locale.
242+
* They are only added by [withFallback] if they don't interfere with symbols already in the provided [locale]
243+
* (so ',' is not added as grouping separator if ',' is already the locale's decimal separator).
244+
*/
245+
internal companion object {
246+
private val allDecimalFormatSymbols by lazy {
247+
Locale.getAvailableLocales().map { DecimalFormatSymbols.getInstance(it) }
248+
}
249+
val MINUS_SIGNS by lazy {
250+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.minusSign }.toCharArray()
251+
}
252+
253+
// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales
254+
val INFINITIES by lazy {
255+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.infinity }
256+
.plus(arrayOf("∞", "inf", "infinity", "infty"))
257+
.toTypedArray()
258+
}
259+
val PLUS_INFINITIES by lazy { INFINITIES.map { "+$it" }.toTypedArray() }
260+
val MINUS_INFINITIES by lazy {
261+
INFINITIES.flatMap { inf -> MINUS_SIGNS.map { min -> min + inf } }.toTypedArray()
262+
}
263+
val NANS by lazy {
264+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.naN }
265+
.plus(arrayOf("nan", "na", "n/a"))
266+
.toTypedArray()
267+
}
268+
val GROUPING_SEPARATORS by lazy {
269+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.groupingSeparator }.toCharArray()
270+
}
271+
val DECIMAL_SEPARATORS by lazy {
272+
allDecimalFormatSymbols.flatMapTo(mutableSetOf()) {
273+
listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator)
274+
}.toCharArray()
275+
}
276+
val EXPONENTS by lazy {
277+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.exponentSeparator }.toTypedArray()
278+
}
279+
val WHITE_SPACES = charArrayOf(' ', '\u00A0', '\u2009', '\u202F', '\t')
280+
}
238281
}

0 commit comments

Comments
 (0)