
Commit 554abfa

Merge remote-tracking branch 'upstream/master' into series_rolling_count_ignores_min_periods
2 parents df2a3e9 + 4e9ee4d

23 files changed, +512 -102 lines

doc/source/whatsnew/v1.0.0.rst

Lines changed: 19 additions & 0 deletions
@@ -483,6 +483,25 @@ Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead.
 
     a.to_numpy(dtype="float", na_value=np.nan)
 
+**Reductions can return ``pd.NA``**
+
+When performing a reduction such as a sum with ``skipna=False``, the result
+will now be ``pd.NA`` instead of ``np.nan`` in the presence of missing values
+(:issue:`30958`).
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+    >>> pd.Series(a).sum(skipna=False)
+    nan
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+    pd.Series(a).sum(skipna=False)
+
 **value_counts returns a nullable integer dtype**
 
 :meth:`Series.value_counts` with a nullable integer dtype now returns a nullable
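For reference, the new behavior is easy to reproduce; a minimal sketch, assuming ``a`` is a nullable integer array as in the surrounding whatsnew examples:

import pandas as pd

# assumed setup, mirroring the whatsnew snippet above
a = pd.array([1, 2, None], dtype="Int64")

# pandas 1.0.0: with skipna=False the reduction now propagates pd.NA
print(pd.Series(a).sum(skipna=False))  # <NA>

# with missing values skipped, a plain integer is still returned
print(pd.Series(a).sum())  # 3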

doc/source/whatsnew/v1.1.0.rst

Lines changed: 2 additions & 0 deletions
@@ -140,6 +140,8 @@ Reshaping
 
 -
 - Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`)
+- Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep a dummy MultiIndex as columns. (:issue:`18321`)
+
 
 Sparse
 ^^^^^^
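A minimal sketch of the :func:`crosstab` case this entry covers; the tuple-named Series below are hypothetical stand-ins for the report in GH 18321:

import pandas as pd

# two Series whose names are tuples
s1 = pd.Series([1, 1, 2], name=("a", "b"))
s2 = pd.Series([3, 4, 4], name=("c", "d"))

# before the fix the result kept a dummy MultiIndex in the columns;
# with the fix the dummy MultiIndex no longer appears
result = pd.crosstab(s1, s2)
print(result.columns)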

pandas/_libs/src/ujson/python/objToJSON.c

Lines changed: 76 additions & 46 deletions
@@ -456,8 +456,8 @@ static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base,
 static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
                                      size_t *len) {
 
-    if (!PyDateTime_Check(obj)) {
-        PyErr_SetString(PyExc_TypeError, "Expected datetime object");
+    if (!PyDate_Check(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Expected date object");
         return NULL;
     }
 
@@ -469,7 +469,7 @@ static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) {
     npy_datetimestruct dts;
     int ret;
 
-    if (!PyDateTime_Check(obj)) {
+    if (!PyDate_Check(obj)) {
         // TODO: raise TypeError
     }
     PyDateTime_Date *dt = (PyDateTime_Date *)obj;
@@ -1504,6 +1504,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
     char **ret;
     char *dataptr, *cLabel;
    int type_num;
+    NPY_DATETIMEUNIT base = enc->datetimeUnit;
     PRINTMARK();
 
     if (!labels) {
@@ -1541,60 +1542,85 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
             break;
         }
 
-        // TODO: vectorized timedelta solution
-        if (enc->datetimeIso &&
-            (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) {
-            PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item);
-            if (td == NULL) {
-                Py_DECREF(item);
-                NpyArr_freeLabels(ret, num);
-                ret = 0;
-                break;
-            }
-
-            PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL);
-            Py_DECREF(td);
-            if (iso == NULL) {
-                Py_DECREF(item);
-                NpyArr_freeLabels(ret, num);
-                ret = 0;
-                break;
-            }
-
-            cLabel = (char *)PyUnicode_AsUTF8(iso);
-            Py_DECREF(iso);
-            len = strlen(cLabel);
-        } else if (PyTypeNum_ISDATETIME(type_num)) {
-            NPY_DATETIMEUNIT base = enc->datetimeUnit;
-            npy_int64 longVal;
+        int is_datetimelike = 0;
+        npy_int64 nanosecVal;
+        if (PyTypeNum_ISDATETIME(type_num)) {
+            is_datetimelike = 1;
             PyArray_VectorUnaryFunc *castfunc =
                 PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64);
             if (!castfunc) {
                 PyErr_Format(PyExc_ValueError,
                              "Cannot cast numpy dtype %d to long",
                              enc->npyType);
             }
-            castfunc(dataptr, &longVal, 1, NULL, NULL);
-            if (enc->datetimeIso) {
-                cLabel = int64ToIso(longVal, base, &len);
+            castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
+        } else if (PyDate_Check(item) || PyDelta_Check(item)) {
+            is_datetimelike = 1;
+            if (PyObject_HasAttrString(item, "value")) {
+                nanosecVal = get_long_attr(item, "value");
             } else {
-                if (!scaleNanosecToUnit(&longVal, base)) {
-                    // TODO: This gets hit but somehow doesn't cause errors
-                    // need to clean up (elsewhere in module as well)
+                if (PyDelta_Check(item)) {
+                    nanosecVal = total_seconds(item) *
+                                 1000000000LL; // nanoseconds per second
+                } else {
+                    // datetime.* objects don't follow above rules
+                    nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
                 }
-                cLabel = PyObject_Malloc(21); // 21 chars for int64
-                sprintf(cLabel, "%" NPY_INT64_FMT, longVal);
-                len = strlen(cLabel);
             }
-        } else if (PyDateTime_Check(item) || PyDate_Check(item)) {
-            NPY_DATETIMEUNIT base = enc->datetimeUnit;
-            if (enc->datetimeIso) {
-                cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len);
+        }
+
+        if (is_datetimelike) {
+            if (nanosecVal == get_nat()) {
+                len = 5; // TODO: shouldn't require extra space for terminator
+                cLabel = PyObject_Malloc(len);
+                strncpy(cLabel, "null", len);
             } else {
-                cLabel = PyObject_Malloc(21); // 21 chars for int64
-                sprintf(cLabel, "%" NPY_DATETIME_FMT,
-                        PyDateTimeToEpoch(item, base));
-                len = strlen(cLabel);
+                if (enc->datetimeIso) {
+                    // TODO: Vectorized Timedelta function
+                    if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
+                        PyObject *td =
+                            PyObject_CallFunction(cls_timedelta, "(O)", item);
+                        if (td == NULL) {
+                            Py_DECREF(item);
+                            NpyArr_freeLabels(ret, num);
+                            ret = 0;
+                            break;
+                        }
+
+                        PyObject *iso =
+                            PyObject_CallMethod(td, "isoformat", NULL);
+                        Py_DECREF(td);
+                        if (iso == NULL) {
+                            Py_DECREF(item);
+                            NpyArr_freeLabels(ret, num);
+                            ret = 0;
+                            break;
+                        }
+
+                        len = strlen(PyUnicode_AsUTF8(iso));
+                        cLabel = PyObject_Malloc(len + 1);
+                        memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1);
+                        Py_DECREF(iso);
+                    } else {
+                        if (type_num == NPY_DATETIME) {
+                            cLabel = int64ToIso(nanosecVal, base, &len);
+                        } else {
+                            cLabel = PyDateTimeToIso((PyDateTime_Date *)item,
                                                     base, &len);
+                        }
+                    }
+                    if (cLabel == NULL) {
+                        Py_DECREF(item);
+                        NpyArr_freeLabels(ret, num);
+                        ret = 0;
+                        break;
+                    }
+                } else {
+                    cLabel = PyObject_Malloc(21); // 21 chars for int64
+                    sprintf(cLabel, "%" NPY_DATETIME_FMT,
+                            NpyDateTimeToEpoch(nanosecVal, base));
+                    len = strlen(cLabel);
+                }
             }
         } else { // Fallback to string representation
            PyObject *str = PyObject_Str(item);
@@ -1615,6 +1641,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
         ret[i] = PyObject_Malloc(len + 1);
         memcpy(ret[i], cLabel, len + 1);
 
+        if (is_datetimelike) {
+            PyObject_Free(cLabel);
+        }
+
         if (PyErr_Occurred()) {
             NpyArr_freeLabels(ret, num);
             ret = 0;
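These label-encoding paths are reachable from Python through ``DataFrame.to_json``; a minimal sketch (the frame itself is hypothetical), assuming the switch from PyDateTime_Check to PyDate_Check is meant to let plain datetime.date labels take the datelike branches above:

import datetime
import pandas as pd

# index labels that are datetime.date (not datetime.datetime) objects
df = pd.DataFrame(
    {"x": [1, 2]},
    index=[datetime.date(2020, 1, 1), datetime.date(2020, 1, 2)],
)

# ISO encoding exercises the PyDateTimeToIso/int64ToIso branches
print(df.to_json(date_format="iso"))

# epoch encoding exercises the NpyDateTimeToEpoch branch
print(df.to_json(date_format="epoch"))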

pandas/core/arrays/boolean.py

Lines changed: 5 additions & 3 deletions
@@ -670,13 +670,15 @@ def _reduce(self, name, skipna=True, **kwargs):
         mask = self._mask
 
         # coerce to a nan-aware float if needed
-        if mask.any():
-            data = self._data.astype("float64")
-            data[mask] = np.nan
+        if self._hasna:
+            data = self.to_numpy("float64", na_value=np.nan)
 
         op = getattr(nanops, "nan" + name)
         result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
 
+        if np.isnan(result):
+            return libmissing.NA
+
         # if we have numeric op that would result in an int, coerce to int if possible
         if name in ["sum", "prod"] and notna(result):
             int_result = np.int64(result)
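A sketch of the resulting ``BooleanArray`` behavior (values illustrative):

import pandas as pd

arr = pd.array([True, False, None], dtype="boolean")

# the new early return converts a NaN reduction result into pd.NA
print(pd.Series(arr).sum(skipna=False))  # <NA>

# when missing values are skipped, the integer coercion still applies
print(pd.Series(arr).sum())  # 1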

pandas/core/arrays/integer.py

Lines changed: 8 additions & 6 deletions
@@ -21,7 +21,7 @@
     is_scalar,
 )
 from pandas.core.dtypes.dtypes import register_extension_dtype
-from pandas.core.dtypes.missing import isna, notna
+from pandas.core.dtypes.missing import isna
 
 from pandas.core import nanops, ops
 from pandas.core.ops import invalid_comparison
@@ -549,21 +549,23 @@ def _reduce(self, name, skipna=True, **kwargs):
         mask = self._mask
 
         # coerce to a nan-aware float if needed
-        if mask.any():
-            data = self._data.astype("float64")
-            # We explicitly use NaN within reductions.
-            data[mask] = np.nan
+        # (we explicitly use NaN within reductions)
+        if self._hasna:
+            data = self.to_numpy("float64", na_value=np.nan)
 
         op = getattr(nanops, "nan" + name)
         result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
 
+        if np.isnan(result):
+            return libmissing.NA
+
         # if we have a boolean op, don't coerce
         if name in ["any", "all"]:
             pass
 
         # if we have a preservable numeric op,
         # provide coercion back to an integer type if possible
-        elif name in ["sum", "min", "max", "prod"] and notna(result):
+        elif name in ["sum", "min", "max", "prod"]:
             int_result = int(result)
             if int_result == result:
                 result = int_result
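Dropping ``notna(result)`` works because the new ``np.isnan`` early return guarantees ``result`` is not missing by the time the integer coercion runs; a sketch of the observable behavior (values illustrative):

import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")

# NaN results now surface as pd.NA before any coercion happens
print(pd.Series(arr).min(skipna=False))  # <NA>
print(pd.Series(arr).min())              # 1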

pandas/core/arrays/sparse/scipy_sparse.py

Lines changed: 4 additions & 4 deletions
@@ -17,14 +17,14 @@ def _check_is_partition(parts, whole):
 
 
 def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
-    """ For arbitrary (MultiIndexed) SparseSeries return
+    """ For arbitrary (MultiIndexed) sparse Series return
     (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for
     passing to scipy.sparse.coo constructor. """
     # index and column levels must be a partition of the index
     _check_is_partition([row_levels, column_levels], range(ss.index.nlevels))
 
-    # from the SparseSeries: get the labels and data for non-null entries
-    values = ss._data.internal_values()._valid_sp_values
+    # from the sparse Series: get the labels and data for non-null entries
+    values = ss.array._valid_sp_values
 
     nonnull_labels = ss.dropna()
 
@@ -85,7 +85,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
 
 def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False):
     """
-    Convert a SparseSeries to a scipy.sparse.coo_matrix using index
+    Convert a sparse Series to a scipy.sparse.coo_matrix using index
     levels row_levels, column_levels as the row and column
     labels respectively. Returns the sparse_matrix, row and column labels.
     """
