Skip to content

[FIX] Handle decimal and thousand separator in 'round_trip' converer #35377

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 11, 2020
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Simplify str_copy_decimal_str.
  • Loading branch information
ales-erjavec committed Aug 10, 2020
commit dcd8de1c6ff9587b5f55d723010841b88cf70a4b
127 changes: 24 additions & 103 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -1778,128 +1778,49 @@ double precise_xstrtod(const char *str, char **endptr, char decimal,
return number;
}

/* copy a decimal number string in form `decimal` and `tsep` and `sci` as
decimal point, thousands separator and sci exponent character to a an
equivalent c-locale decimal string (striping tsep, replacing `decimal`
with '.'). Return NULL if nothing could be copied.
/* copy a decimal number string with `decimal`, `tsep` as decimal point
and thousands separator to an equivalent c-locale decimal string (striping
`tsep`, replacing `decimal` with '.'). The returned memory should be free-d
with a call to `free`.
*/

char* str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
char tsep, char sci) {
#define IS_TSEP(c) (tsep != '\0' && c == tsep)
ssize_t size = 0;
ssize_t num_digits = 0;
char has_exponent = 0;
char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal,
char tsep) {
const char *p = s;
// First count how many characters we can consume.
// Leading sign
if (*p == '+' || *p == '-') p++;
// Integer part
while (isdigit_ascii(*p)) {
p++;
p += IS_TSEP(*p);
num_digits++;
}
// Fractional part
if (*p == decimal) {
p++;
while (isdigit_ascii(*p)) {
p++;
num_digits++;
}
}
if (num_digits == 0) {
if (endpos != NULL) {
*endpos = (char *)s;
}
return NULL;
}
// Exponent part
if (toupper_ascii(*p) == toupper_ascii(sci)) {
const char * p_at_e = p;
num_digits = 0;
p++;
// Exponent sign
if (*p == '+' || *p == '-') p++;
// Exponent
while (isdigit_ascii(*p)) {
p++;
num_digits++;
}
if (num_digits == 0) {
// no digits after exponent; un-consume the (+|-)?
p = p_at_e;
has_exponent = 0;
} else {
has_exponent = 1;
}
}

size = p - s;
char *pc = malloc(size + 1);
memcpy(pc, p, size);
pc[size] = '\0';
char *dst = pc;
p = s;
num_digits = 0;
// Copy leading sign
size_t length = strlen(s);
char *s_copy = malloc(length + 1);
char *dst = s_copy;
// Copy Leading sign
if (*p == '+' || *p == '-') {
*dst++ = *p++;
}
// Copy integer part
// Copy integer part dropping `tsep`
while (isdigit_ascii(*p)) {
*dst++ = *p++;
p += IS_TSEP(*p);
num_digits++;
p += (tsep != '\0' && *p == tsep);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the null byte check here serve a purpose?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. tsep is null when no thousands separator is defined. But the loop must not skip the terminating null byte in the input string (the second condition) which would happen on e.g. '123\0'

}
// Copy factional part, replacing `decimal` with '.'
// Replace `decimal` with '.'
if (*p == decimal) {
*dst++ = '.';
p++;
while (isdigit_ascii(*p)) {
*dst++ = *p++;
num_digits++;
}
}
assert(num_digits > 0);
// Copy exponent
if (has_exponent && toupper_ascii(*p) == toupper_ascii(sci)) {
num_digits = 0;
*dst++ = *p++;
// Copy leading exponent sign
if (*p == '+' || *p == '-') {
*dst++ = *p++;
}
// Exponent
while (isdigit_ascii(*p)) {
*dst++ = *p++;
num_digits++;
}
assert(num_digits > 0);
}
*dst = '\0';
if (endpos != NULL) {
*endpos = (char *)p;
}
return pc;
#undef IS_TSEP
*dst++ = '.';
p++;
}
// Copy the remainder of the string as is.
memcpy(dst, p, length + 1 - (p - s));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you use strncpy here instead? Would make this slightly more readable and then you won't need to manually add a null byte

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

s_copy[length] = '\0';
if (endpos != NULL)
*endpos = (char *)(s + length);
return s_copy;
}


double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int) {
char *pc = NULL;
// 'normalize' representation to C-locale; replace decimal with '.' and
// remove t(housand)sep.
char *endptr = NULL;
if (decimal != '.' || tsep != '\0') {
pc = str_copy_decimal_str_c(p, &endptr, decimal, tsep, sci);
if (pc == NULL) {
if (q != NULL) {
*q = (char *)p;
}
*error = -1;
return 0.0;
}
pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep);
}
// This is called from a nogil block in parsers.pyx
// so need to explicitly get GIL before Python calls
Expand Down