Skip to content

Commit a0c4f85

Browse files
[libc] Change ctype to be encoding independent (llvm#110574)
The previous implementation of the ctype functions assumed ASCII. This patch replaces it with a switch/case implementation that looks odd but is actually easier for the compiler to understand and optimize.
1 parent e0ae779 commit a0c4f85

33 files changed

+915
-185
lines changed

libc/src/__support/ctype_utils.h

Lines changed: 546 additions & 23 deletions
Large diffs are not rendered by default.

libc/src/__support/high_precision_decimal.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,11 @@ class HighPrecisionDecimal {
178178
if (digit_index >= this->num_digits) {
179179
return new_digits - 1;
180180
}
181-
if (this->digits[digit_index] != power_of_five[digit_index] - '0') {
181+
if (this->digits[digit_index] !=
182+
internal::b36_char_to_int(power_of_five[digit_index])) {
182183
return new_digits -
183-
((this->digits[digit_index] < power_of_five[digit_index] - '0')
184+
((this->digits[digit_index] <
185+
internal::b36_char_to_int(power_of_five[digit_index]))
184186
? 1
185187
: 0);
186188
}
@@ -337,8 +339,8 @@ class HighPrecisionDecimal {
337339
}
338340
++total_digits;
339341
if (this->num_digits < MAX_NUM_DIGITS) {
340-
this->digits[this->num_digits] =
341-
static_cast<uint8_t>(num_string[num_cur] - '0');
342+
this->digits[this->num_digits] = static_cast<uint8_t>(
343+
internal::b36_char_to_int(num_string[num_cur]));
342344
++this->num_digits;
343345
} else if (num_string[num_cur] != '0') {
344346
this->truncated = true;

libc/src/__support/integer_literals.h

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@
1313
#ifndef LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H
1414
#define LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H
1515

16-
#include "src/__support/CPP/limits.h" // CHAR_BIT
16+
#include "src/__support/CPP/limits.h" // CHAR_BIT
17+
#include "src/__support/ctype_utils.h"
1718
#include "src/__support/macros/attributes.h" // LIBC_INLINE
1819
#include "src/__support/macros/config.h"
19-
#include "src/__support/uint128.h" // UInt128
20-
#include <stddef.h> // size_t
21-
#include <stdint.h> // uintxx_t
20+
#include "src/__support/uint128.h" // UInt128
21+
#include <stddef.h> // size_t
22+
#include <stdint.h> // uintxx_t
2223

2324
namespace LIBC_NAMESPACE_DECL {
2425

@@ -75,26 +76,13 @@ template <typename T, int base> struct DigitBuffer {
7576
push(*str);
7677
}
7778

78-
// Returns the digit for a particular character.
79-
// Returns INVALID_DIGIT if the character is invalid.
80-
LIBC_INLINE static constexpr uint8_t get_digit_value(const char c) {
81-
const auto to_lower = [](char c) { return c | 32; };
82-
const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };
83-
const auto is_alpha = [](char c) {
84-
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
85-
};
86-
if (is_digit(c))
87-
return static_cast<uint8_t>(c - '0');
88-
if (base > 10 && is_alpha(c))
89-
return static_cast<uint8_t>(to_lower(c) - 'a' + 10);
90-
return INVALID_DIGIT;
91-
}
92-
9379
// Adds a single character to this buffer.
9480
LIBC_INLINE constexpr void push(char c) {
9581
if (c == '\'')
9682
return; // ' is valid but not taken into account.
97-
const uint8_t value = get_digit_value(c);
83+
const int b36_val = internal::b36_char_to_int(c);
84+
const uint8_t value = static_cast<uint8_t>(
85+
b36_val < base && (b36_val != 0 || c == '0') ? b36_val : INVALID_DIGIT);
9886
if (value == INVALID_DIGIT || size >= MAX_DIGITS) {
9987
// During constant evaluation `__builtin_unreachable` will halt the
10088
// compiler as it is not executable. This is preferable over `assert` that

libc/src/__support/integer_to_string.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
#include "src/__support/CPP/type_traits.h"
7070
#include "src/__support/big_int.h" // make_integral_or_big_int_unsigned_t
7171
#include "src/__support/common.h"
72+
#include "src/__support/ctype_utils.h"
7273
#include "src/__support/macros/config.h"
7374

7475
namespace LIBC_NAMESPACE_DECL {
@@ -214,9 +215,9 @@ template <typename T, typename Fmt = radix::Dec> class IntegerToString {
214215
using UNSIGNED_T = make_integral_or_big_int_unsigned_t<T>;
215216

216217
LIBC_INLINE static char digit_char(uint8_t digit) {
217-
if (digit < 10)
218-
return '0' + static_cast<char>(digit);
219-
return (Fmt::IS_UPPERCASE ? 'A' : 'a') + static_cast<char>(digit - 10);
218+
const int result = internal::int_to_b36_char(digit);
219+
return static_cast<char>(Fmt::IS_UPPERCASE ? internal::toupper(result)
220+
: result);
220221
}
221222

222223
LIBC_INLINE static void

libc/src/__support/str_to_float.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -909,7 +909,7 @@ decimal_string_to_float(const char *__restrict src, const char DECIMAL_POINT,
909909
cpp::numeric_limits<StorageType>::max() / BASE;
910910
while (true) {
911911
if (isdigit(src[index])) {
912-
uint32_t digit = src[index] - '0';
912+
uint32_t digit = b36_char_to_int(src[index]);
913913
seen_digit = true;
914914

915915
if (mantissa < bitstype_max_div_by_base) {

libc/src/__support/str_to_integer.h

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,22 +42,14 @@ first_non_whitespace(const char *__restrict src,
4242
return src + src_cur;
4343
}
4444

45-
LIBC_INLINE int b36_char_to_int(char input) {
46-
if (isdigit(input))
47-
return input - '0';
48-
if (isalpha(input))
49-
return (input | 32) + 10 - 'a';
50-
return 0;
51-
}
52-
5345
// checks if the next 3 characters of the string pointer are the start of a
5446
// hexadecimal number. Does not advance the string pointer.
5547
LIBC_INLINE bool
5648
is_hex_start(const char *__restrict src,
5749
size_t src_len = cpp::numeric_limits<size_t>::max()) {
5850
if (src_len < 3)
5951
return false;
60-
return *src == '0' && (*(src + 1) | 32) == 'x' && isalnum(*(src + 2)) &&
52+
return *src == '0' && tolower(*(src + 1)) == 'x' && isalnum(*(src + 2)) &&
6153
b36_char_to_int(*(src + 2)) < 16;
6254
}
6355

libc/src/ctype/isxdigit.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ namespace LIBC_NAMESPACE_DECL {
1616

1717
LLVM_LIBC_FUNCTION(int, isxdigit, (int c)) {
1818
const unsigned ch = static_cast<unsigned>(c);
19-
return static_cast<int>(internal::isdigit(ch) || (ch | 32) - 'a' < 6);
19+
return static_cast<int>(internal::isalnum(ch) &&
20+
internal::b36_char_to_int(ch) < 16);
2021
}
2122

2223
} // namespace LIBC_NAMESPACE_DECL

libc/src/ctype/isxdigit_l.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ namespace LIBC_NAMESPACE_DECL {
1616

1717
LLVM_LIBC_FUNCTION(int, isxdigit_l, (int c, locale_t)) {
1818
const unsigned ch = static_cast<unsigned>(c);
19-
return static_cast<int>(internal::isdigit(ch) || (ch | 32) - 'a' < 6);
19+
return static_cast<int>(internal::isalnum(ch) &&
20+
internal::b36_char_to_int(ch) < 16);
2021
}
2122

2223
} // namespace LIBC_NAMESPACE_DECL

libc/src/ctype/toupper.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@
1414

1515
namespace LIBC_NAMESPACE_DECL {
1616

17-
LLVM_LIBC_FUNCTION(int, toupper, (int c)) {
18-
if (internal::islower(c))
19-
return c - ('a' - 'A');
20-
return c;
21-
}
17+
LLVM_LIBC_FUNCTION(int, toupper, (int c)) { return internal::toupper(c); }
2218

2319
} // namespace LIBC_NAMESPACE_DECL

libc/src/ctype/toupper_l.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,7 @@
1515
namespace LIBC_NAMESPACE_DECL {
1616

1717
LLVM_LIBC_FUNCTION(int, toupper_l, (int c, locale_t)) {
18-
if (internal::islower(c))
19-
return c - ('a' - 'A');
20-
return c;
18+
return internal::toupper(c);
2119
}
2220

2321
} // namespace LIBC_NAMESPACE_DECL

libc/src/stdio/printf_core/fixed_converter.h

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "include/llvm-libc-macros/stdfix-macros.h"
1313
#include "src/__support/CPP/string_view.h"
14+
#include "src/__support/ctype_utils.h"
1415
#include "src/__support/fixed_point/fx_bits.h"
1516
#include "src/__support/fixed_point/fx_rep.h"
1617
#include "src/__support/integer_to_string.h"
@@ -68,10 +69,6 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) {
6869
using LARep = fixed_point::FXRep<unsigned long accum>;
6970
using StorageType = LARep::StorageType;
7071

71-
// All of the letters will be defined relative to variable a, which will be
72-
// the appropriate case based on the name of the conversion. This converts any
73-
// conversion name into the letter 'a' with the appropriate case.
74-
const char a = (to_conv.conv_name & 32) | 'A';
7572
FormatFlags flags = to_conv.flags;
7673

7774
bool is_negative;
@@ -179,9 +176,9 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) {
179176
// unspecified.
180177
RoundDirection round;
181178
char first_digit_after = fraction_digits[precision];
182-
if (first_digit_after > '5') {
179+
if (internal::b36_char_to_int(first_digit_after) > 5) {
183180
round = RoundDirection::Up;
184-
} else if (first_digit_after < '5') {
181+
} else if (internal::b36_char_to_int(first_digit_after) < 5) {
185182
round = RoundDirection::Down;
186183
} else {
187184
// first_digit_after == '5'
@@ -204,7 +201,8 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) {
204201
keep_rounding = false;
205202
char cur_digit = fraction_digits[digit_to_round];
206203
// if the digit should not be rounded up
207-
if (round == RoundDirection::Even && ((cur_digit - '0') % 2) == 0) {
204+
if (round == RoundDirection::Even &&
205+
(internal::b36_char_to_int(cur_digit) % 2) == 0) {
208206
// break out of the loop
209207
break;
210208
}
@@ -246,7 +244,7 @@ LIBC_INLINE int convert_fixed(Writer *writer, const FormatSection &to_conv) {
246244
char sign_char = 0;
247245

248246
// Check if the conv name is uppercase
249-
if (a == 'A') {
247+
if (internal::isupper(to_conv.conv_name)) {
250248
// These flags are only for signed conversions, so this removes them if the
251249
// conversion is unsigned.
252250
flags = FormatFlags(flags &

libc/src/stdio/printf_core/float_dec_converter.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "src/__support/FPUtil/FPBits.h"
1414
#include "src/__support/FPUtil/rounding_mode.h"
1515
#include "src/__support/big_int.h" // is_big_int_v
16+
#include "src/__support/ctype_utils.h"
1617
#include "src/__support/float_to_string.h"
1718
#include "src/__support/integer_to_string.h"
1819
#include "src/__support/libc_assert.h"
@@ -587,8 +588,6 @@ LIBC_INLINE int convert_float_dec_exp_typed(Writer *writer,
587588
int exponent = float_bits.get_explicit_exponent();
588589
StorageType mantissa = float_bits.get_explicit_mantissa();
589590

590-
const char a = (to_conv.conv_name & 32) | 'A';
591-
592591
char sign_char = 0;
593592

594593
if (float_bits.is_neg())
@@ -734,7 +733,8 @@ LIBC_INLINE int convert_float_dec_exp_typed(Writer *writer,
734733
round = get_round_direction(last_digit, truncated, float_bits.sign());
735734

736735
RET_IF_RESULT_NEGATIVE(float_writer.write_last_block(
737-
digits, maximum, round, final_exponent, a + 'E' - 'A'));
736+
digits, maximum, round, final_exponent,
737+
internal::islower(to_conv.conv_name) ? 'e' : 'E'));
738738

739739
RET_IF_RESULT_NEGATIVE(float_writer.right_pad());
740740
return WRITE_OK;

libc/src/stdio/printf_core/float_hex_converter.h

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "src/__support/CPP/string_view.h"
1313
#include "src/__support/FPUtil/FPBits.h"
1414
#include "src/__support/FPUtil/rounding_mode.h"
15+
#include "src/__support/ctype_utils.h"
1516
#include "src/__support/macros/config.h"
1617
#include "src/stdio/printf_core/converter_utils.h"
1718
#include "src/stdio/printf_core/core_structs.h"
@@ -28,10 +29,6 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer,
2829
const FormatSection &to_conv) {
2930
using LDBits = fputil::FPBits<long double>;
3031
using StorageType = LDBits::StorageType;
31-
// All of the letters will be defined relative to variable a, which will be
32-
// the appropriate case based on the name of the conversion. This converts any
33-
// conversion name into the letter 'a' with the appropriate case.
34-
const char a = (to_conv.conv_name & 32) | 'A';
3532

3633
bool is_negative;
3734
int exponent;
@@ -138,9 +135,10 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer,
138135
size_t mant_cur = mant_len;
139136
size_t first_non_zero = 1;
140137
for (; mant_cur > 0; --mant_cur, mantissa >>= 4) {
141-
char mant_mod_16 = static_cast<char>(mantissa) & 15;
142-
char new_digit = static_cast<char>(
143-
(mant_mod_16 > 9) ? (mant_mod_16 - 10 + a) : (mant_mod_16 + '0'));
138+
char mant_mod_16 = static_cast<char>(mantissa % 16);
139+
char new_digit = static_cast<char>(internal::int_to_b36_char(mant_mod_16));
140+
if (internal::isupper(to_conv.conv_name))
141+
new_digit = static_cast<char>(internal::toupper(new_digit));
144142
mant_buffer[mant_cur - 1] = new_digit;
145143
if (new_digit != '0' && first_non_zero < mant_cur)
146144
first_non_zero = mant_cur;
@@ -168,7 +166,8 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer,
168166

169167
size_t exp_cur = EXP_LEN;
170168
for (; exponent > 0; --exp_cur, exponent /= 10) {
171-
exp_buffer[exp_cur - 1] = static_cast<char>((exponent % 10) + '0');
169+
exp_buffer[exp_cur - 1] =
170+
static_cast<char>(internal::int_to_b36_char(exponent % 10));
172171
}
173172
if (exp_cur == EXP_LEN) { // if nothing else was written, write a 0.
174173
exp_buffer[EXP_LEN - 1] = '0';
@@ -187,7 +186,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer,
187186
constexpr size_t PREFIX_LEN = 2;
188187
char prefix[PREFIX_LEN];
189188
prefix[0] = '0';
190-
prefix[1] = a + ('x' - 'a');
189+
prefix[1] = internal::islower(to_conv.conv_name) ? 'x' : 'X';
191190
const cpp::string_view prefix_str(prefix, PREFIX_LEN);
192191

193192
// If the precision is greater than the actual result, pad with 0s
@@ -200,7 +199,7 @@ LIBC_INLINE int convert_float_hex_exp(Writer *writer,
200199
constexpr cpp::string_view HEXADECIMAL_POINT(".");
201200

202201
// This is for the letter 'p' before the exponent.
203-
const char exp_separator = a + ('p' - 'a');
202+
const char exp_separator = internal::islower(to_conv.conv_name) ? 'p' : 'P';
204203
constexpr int EXP_SEPARATOR_LEN = 1;
205204

206205
padding = static_cast<int>(to_conv.min_width - (sign_char > 0 ? 1 : 0) -

libc/src/stdio/printf_core/float_inf_nan_converter.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FLOAT_INF_NAN_CONVERTER_H
1111

1212
#include "src/__support/FPUtil/FPBits.h"
13+
#include "src/__support/ctype_utils.h"
1314
#include "src/__support/macros/config.h"
1415
#include "src/stdio/printf_core/converter_utils.h"
1516
#include "src/stdio/printf_core/core_structs.h"
@@ -26,8 +27,6 @@ using StorageType = fputil::FPBits<long double>::StorageType;
2627
LIBC_INLINE int convert_inf_nan(Writer *writer, const FormatSection &to_conv) {
2728
// The output text ("inf"/"nan" or "INF"/"NAN") is written in the same case
2829
// as the conversion name (to_conv.conv_name).
29-
const char a = (to_conv.conv_name & 32) | 'A';
30-
3130
bool is_negative;
3231
StorageType mantissa;
3332
if (to_conv.length_modifier == LengthModifier::L) {
@@ -66,9 +65,11 @@ LIBC_INLINE int convert_inf_nan(Writer *writer, const FormatSection &to_conv) {
6665
if (sign_char)
6766
RET_IF_RESULT_NEGATIVE(writer->write(sign_char));
6867
if (mantissa == 0) { // inf
69-
RET_IF_RESULT_NEGATIVE(writer->write(a == 'a' ? "inf" : "INF"));
68+
RET_IF_RESULT_NEGATIVE(
69+
writer->write(internal::islower(to_conv.conv_name) ? "inf" : "INF"));
7070
} else { // nan
71-
RET_IF_RESULT_NEGATIVE(writer->write(a == 'a' ? "nan" : "NAN"));
71+
RET_IF_RESULT_NEGATIVE(
72+
writer->write(internal::islower(to_conv.conv_name) ? "nan" : "NAN"));
7273
}
7374

7475
if (padding > 0 && ((to_conv.flags & FormatFlags::LEFT_JUSTIFIED) ==

0 commit comments

Comments
 (0)