@@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols
5
5
import io.github.oshai.kotlinlogging.KotlinLogging
6
6
import org.jetbrains.kotlinx.dataframe.DataFrame
7
7
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
8
- import org.jetbrains.kotlinx.dataframe.api.parser
9
8
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
10
9
import java.nio.charset.Charset
11
10
import java.text.DecimalFormatSymbols
@@ -15,19 +14,24 @@ import java.util.Locale
15
14
16
15
private val logger = KotlinLogging .logger {}
17
16
18
- // (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales
19
- private val INFINITIES = arrayOf(" ∞" , " inf" , " infinity" , " infty" )
20
- private val PLUS_INFINITIES = INFINITIES .map { " +$it " }
21
- private val MINUS_INFINITIES = INFINITIES .map { " -$it " }
22
- private val NANS = arrayOf(" nan" , " na" , " n/a" )
23
-
24
17
/* *
25
18
* Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double].
26
19
*
27
20
* If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the
28
21
* fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser).
29
22
* If not, or if it fails, it will use [NumberFormat] to parse the input.
30
23
*
24
+ * The [locale][locale] used by the double parser is determined as follows:
25
+ *
26
+ * [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` ?: `[Locale.getDefault()][Locale.getDefault]
27
+ *
28
+ * [FastDoubleParser] has a fallback mechanism; in practice, this means it can recognize symbols and notations
29
+ * of any locale recognized by Java as long as that symbol does not conflict with the given locale.
30
+ *
31
+ * For example, if your locale uses ',' as the decimal separator, it will NOT recognize ',' as a thousands separator, but it will
32
+ * recognize ' ', '٬', '_', ' ', etc. as thousands separator.
33
+ * The same holds for characters like "e", "inf", "x10", "NaN", etc.
34
+ *
31
35
* Public, so it can be used in other modules.
32
36
*
33
37
* @param parserOptions can be supplied to configure the parser.
@@ -41,10 +45,8 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
41
45
42
46
private val useFastDoubleParser = parserOptions?.useFastDoubleParser ? : Parsers .useFastDoubleParser
43
47
private val locale = parserOptions?.locale ? : Parsers .locale
44
- private val fallbackLocale = Locale .ROOT
45
48
46
49
private val localDecimalFormatSymbols = DecimalFormatSymbols .getInstance(locale)
47
- private val fallbackDecimalFormatSymbols = DecimalFormatSymbols .getInstance(fallbackLocale)
48
50
49
51
private val parser = ConfigurableDoubleParser (/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true )
50
52
@@ -75,71 +77,69 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
75
77
}
76
78
77
79
/* *
78
- * Builds a set with the specified char from [localDecimalFormatSymbols] and
79
- * its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so.
80
- * [additionals] will be added to the set too, when they're safe to add.
80
+ * Builds a set containing the receiver char ([this], lowercased);
81
+ * [fallbackChars] are added to the set too, when they're safe to add.
81
82
*/
82
- fun (( DecimalFormatSymbols ) -> Char ).fromLocalWithFallBack( vararg additionals : Char ): Set <Char > =
83
+ fun Char. withFallback ( fallbackChars : CharArray ): Set <Char > =
83
84
buildSet {
84
- val getChar = this @fromLocalWithFallBack
85
- val char = getChar(localDecimalFormatSymbols).lowercaseChar()
85
+ val char = this @withFallback.lowercaseChar()
86
86
add(char)
87
87
88
- // add fallback char if it's safe to do so
89
- val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar()
90
- if (fallbackChar !in localChars && ! localStrings.any { fallbackChar in it }) {
91
- add(fallbackChar)
92
- }
93
-
94
- // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
95
- if (char.isWhitespace()) add(' ' )
88
+ // Treat NBSP and other whitespace characters the same.
89
+ if (char.isWhitespace()) addAll(WHITE_SPACES .asIterable())
96
90
97
- // add additional chars if needed
98
- for (additional in additionals ) {
99
- val lowercase = additional .lowercaseChar()
91
+ // add fallback chars if needed
92
+ for (char in fallbackChars ) {
93
+ val lowercase = char .lowercaseChar()
100
94
if (lowercase !in localChars && ! localStrings.any { lowercase in it }) {
101
95
add(lowercase)
102
96
}
97
+
98
+ // Treat NBSP and other whitespace characters the same.
99
+ if (char.isWhitespace()) addAll(WHITE_SPACES .asIterable())
103
100
}
104
101
}
105
102
106
103
/* *
107
- * Builds a set with the specified string from [localDecimalFormatSymbols] and
108
- * its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so.
109
- * [additionals] will be added to the set too, when they're safe to add.
104
+ * Builds a set containing the receiver string ([this], lowercased);
105
+ * [fallbackStrings] are added to the set too, when they're safe to add.
110
106
*/
111
- fun (( DecimalFormatSymbols ) -> String ).fromLocalWithFallBack( vararg additionals : String ): Set <String > =
107
+ fun String. withFallback ( fallbackStrings : Array < String > ): Set <String > =
112
108
buildSet {
113
- val getString = this @fromLocalWithFallBack
114
- val string = getString(localDecimalFormatSymbols).lowercase()
109
+ val string = this @withFallback.lowercase()
115
110
add(string)
116
111
117
- // add fallback string if it's safe to do so
118
- val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase()
119
- if (! fallbackString.any { it in localChars } && fallbackString !in localStrings) {
120
- add(fallbackString)
121
- }
122
-
123
- // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
124
- if (string.isBlank()) add(" " )
112
+ // Treat NBSP and other whitespace characters the same.
113
+ if (string.isBlank()) addAll(WHITE_SPACES .map { it.toString() })
125
114
126
- // add additional strings if needed
127
- for (additional in additionals ) {
128
- val lowercase = additional .lowercase()
115
+ // add fallback strings if needed
116
+ for (string in fallbackStrings ) {
117
+ val lowercase = string .lowercase()
129
118
if (! lowercase.any { it in localChars } && lowercase !in localStrings) {
130
119
add(lowercase)
131
120
}
121
+
122
+ // Treat NBSP and other whitespace characters the same.
123
+ if (string.isBlank()) addAll(WHITE_SPACES .map { it.toString() })
132
124
}
133
125
}
134
126
135
127
return NumberFormatSymbols .fromDecimalFormatSymbols(localDecimalFormatSymbols)
136
- .withPlusSign(setOf (' +' ))
137
- .withDecimalSeparator(DecimalFormatSymbols ::getDecimalSeparator.fromLocalWithFallBack())
138
- .withGroupingSeparator(DecimalFormatSymbols ::getGroupingSeparator.fromLocalWithFallBack())
139
- .withExponentSeparator(DecimalFormatSymbols ::getExponentSeparator.fromLocalWithFallBack())
140
- .withMinusSign(DecimalFormatSymbols ::getMinusSign.fromLocalWithFallBack())
141
- .withInfinity(DecimalFormatSymbols ::getInfinity.fromLocalWithFallBack(* INFINITIES ))
142
- .withNaN(DecimalFormatSymbols ::getNaN.fromLocalWithFallBack(* NANS ))
128
+ .withPlusSign(
129
+ setOf (' +' ),
130
+ ).withDecimalSeparator(
131
+ localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS ),
132
+ ).withGroupingSeparator(
133
+ localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS ),
134
+ ).withExponentSeparator(
135
+ localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS ),
136
+ ).withMinusSign(
137
+ localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS ),
138
+ ).withInfinity(
139
+ localDecimalFormatSymbols.infinity.withFallback(INFINITIES ),
140
+ ).withNaN(
141
+ localDecimalFormatSymbols.naN.withFallback(NANS ),
142
+ )
143
143
}
144
144
145
145
/* * Fallback method for parsing doubles. */
@@ -152,7 +152,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
152
152
in NANS -> Double .NaN
153
153
154
154
else -> {
155
- // not thread safe; must be created here
155
+ // NumberFormat is not thread safe; must be created in the function body
156
156
val numberFormat = NumberFormat .getInstance(locale)
157
157
val parsePosition = ParsePosition (0 )
158
158
val result = numberFormat.parse(this , parsePosition)?.toDouble()
@@ -235,4 +235,47 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
235
235
}
236
236
return String (chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback()
237
237
}
238
+
239
+ /* *
240
+ * Here we store all possible decimal format symbols of all locales on the system.
241
+ * These will be used as fallbacks for the selected locale.
242
+ * They are only added by [withFallback] if they don't interfere with symbols already in the provided [locale]
243
+ * (so ',' is not added as grouping separator if ',' is already the locale's decimal separator).
244
+ */
245
+ internal companion object {
246
+ private val allDecimalFormatSymbols by lazy {
247
+ Locale .getAvailableLocales().map { DecimalFormatSymbols .getInstance(it) }
248
+ }
249
+ val MINUS_SIGNS by lazy {
250
+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.minusSign }.toCharArray()
251
+ }
252
+
253
+ // Strings recognized as infinity and NaN: every locale's own symbol (added as-is, not lowercased) plus common lowercase spellings
254
+ val INFINITIES by lazy {
255
+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.infinity }
256
+ .plus(arrayOf(" ∞" , " inf" , " infinity" , " infty" ))
257
+ .toTypedArray()
258
+ }
259
+ val PLUS_INFINITIES by lazy { INFINITIES .map { " +$it " }.toTypedArray() }
260
+ val MINUS_INFINITIES by lazy {
261
+ INFINITIES .flatMap { inf -> MINUS_SIGNS .map { min -> min + inf } }.toTypedArray()
262
+ }
263
+ val NANS by lazy {
264
+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.naN }
265
+ .plus(arrayOf(" nan" , " na" , " n/a" ))
266
+ .toTypedArray()
267
+ }
268
+ val GROUPING_SEPARATORS by lazy {
269
+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.groupingSeparator }.toCharArray()
270
+ }
271
+ val DECIMAL_SEPARATORS by lazy {
272
+ allDecimalFormatSymbols.flatMapTo(mutableSetOf ()) {
273
+ listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator)
274
+ }.toCharArray()
275
+ }
276
+ val EXPONENTS by lazy {
277
+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.exponentSeparator }.toTypedArray()
278
+ }
279
+ val WHITE_SPACES = charArrayOf(' ' , ' \u00A0 ' , ' \u2009 ' , ' \u202F ' , ' \t ' )
280
+ }
238
281
}
0 commit comments