Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Assorted UBSAN cleanups #55112

Merged
merged 5 commits into from
Sep 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pandas/_libs/src/vendored/ujson/lib/ultrajsonenc.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ Numeric decoder derived from TCL library
#include <float.h>
#include <locale.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -763,7 +764,12 @@ void Buffer_AppendIntUnchecked(JSONObjectEncoder *enc, JSINT32 value) {

void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value) {
char *wstr;
JSUINT64 uvalue = (value < 0) ? -value : value;
JSUINT64 uvalue;
if (value == INT64_MIN) {
uvalue = INT64_MAX + UINT64_C(1);
} else {
uvalue = (value < 0) ? -value : value;
}

wstr = enc->offset;
// Conversion. Number is reversed.
Expand Down
49 changes: 34 additions & 15 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
cimport cython
from cpython.datetime cimport (
PyDateTime_CheckExact,
PyDateTime_DATE_GET_HOUR,
Expand All @@ -18,6 +17,7 @@ from cpython.object cimport (
Py_LT,
Py_NE,
)
from libc.stdint cimport INT64_MAX

import_datetime()
PandasDateTime_IMPORT
Expand Down Expand Up @@ -545,14 +545,14 @@ cdef ndarray astype_round_check(
return iresult


@cython.overflowcheck(True)
cdef int64_t get_conversion_factor(
NPY_DATETIMEUNIT from_unit,
NPY_DATETIMEUNIT to_unit
) except? -1:
"""
Find the factor by which we need to multiply to convert from from_unit to to_unit.
"""
cdef int64_t value, overflow_limit, factor
if (
from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
Expand All @@ -565,28 +565,44 @@ cdef int64_t get_conversion_factor(
return 1

if from_unit == NPY_DATETIMEUNIT.NPY_FR_W:
return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit)
factor = 7
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D:
return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit)
factor = 24
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h:
return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit)
factor = 60
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m:
return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit)
factor = 60
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit)
factor = 1000
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit)
factor = 1000
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit)
factor = 1000
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit)
factor = 1000
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
factor = 1000
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
value = get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
factor = 1000
else:
raise ValueError("Converting from M or Y units is not supported.")

overflow_limit = INT64_MAX // factor
if value > overflow_limit or value < -overflow_limit:
raise OverflowError("result would overflow")

return factor * value


cdef int64_t convert_reso(
int64_t value,
Expand All @@ -595,7 +611,7 @@ cdef int64_t convert_reso(
bint round_ok,
) except? -1:
cdef:
int64_t res_value, mult, div, mod
int64_t res_value, mult, div, mod, overflow_limit

if from_reso == to_reso:
return value
Expand Down Expand Up @@ -624,9 +640,12 @@ cdef int64_t convert_reso(
else:
# e.g. ns -> us, risk of overflow, but no risk of lossy rounding
mult = get_conversion_factor(from_reso, to_reso)
with cython.overflowcheck(True):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a cython bug?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it's a bug per se. I think Cython let's the overflow happen but then adds checks after the fact to see if it overflowed. This by contrast prevents the overflow from happening in the first place. It generally gets you to the same place in the end

Copy link
Member Author

@WillAyd WillAyd Sep 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like Cython generates something like this:

static CYTHON_INLINE int __Pyx_mul_const_int_checking_overflow(int a, int b, int *overflow) {
    if (b > 1) {
        *overflow |= a > __PYX_MAX(int) / b;
        *overflow |= a < __PYX_MIN(int) / b;
    } else if (b == -1) {
        *overflow |= a == __PYX_MIN(int);
    } else if (b < -1) {
        *overflow |= a > __PYX_MIN(int) / b;
        *overflow |= a < __PYX_MAX(int) / b;
    }
    return a * b;
}

We aren't handling a negative denominator, but otherwise yea the difference is Cython still does the multiplication and just sets an overflow variable if something overflows; we are not doing the multiplication at all in this

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for my edification, this pattern is considered Better Practice than the one cython uses?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the only difference here is that this will make the sanitizer happy whereas the cython approach will not

overflow_limit = INT64_MAX // mult
if value > overflow_limit or value < -overflow_limit:
# Note: caller is responsible for re-raising as OutOfBoundsTimedelta
res_value = value * mult
raise OverflowError("result would overflow")

res_value = value * mult

return res_value

Expand Down