Skip to content

Commit 2a74305

Browse files
committed
ENH: Allow SparseDataFrame/SparseSeries values assignment
1 parent 06813d6 commit 2a74305

File tree

7 files changed

+185
-63
lines changed

7 files changed

+185
-63
lines changed

pandas/core/indexing.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -526,7 +526,10 @@ def setter(item, v):
526526
# set the item, possibly having a dtype change
527527
s._consolidate_inplace()
528528
s = s.copy()
529-
s._data = s._data.setitem(indexer=pi, value=v)
529+
if is_sparse(s):
530+
s.set_value(pi, v, takeable=is_list_like(pi))
531+
else:
532+
s._data = s._data.setitem(indexer=pi, value=v)
530533
s._maybe_update_cacher(clear=True)
531534

532535
# reset the sliced object if unique
@@ -631,8 +634,13 @@ def can_do_equal_len():
631634

632635
# actually do the set
633636
self.obj._consolidate_inplace()
634-
self.obj._data = self.obj._data.setitem(indexer=indexer,
635-
value=value)
637+
if is_sparse(self.obj):
638+
# SparseSeries has underlying SparseArray, which doesn't
639+
# support resizing
640+
self.obj[indexer] = value
641+
else:
642+
self.obj._data = self.obj._data.setitem(indexer=indexer,
643+
value=value)
636644
self.obj._maybe_update_cacher(clear=True)
637645

638646
def _align_series(self, indexer, ser, multiindex_indexer=False):
@@ -1929,6 +1937,11 @@ def _has_valid_setitem_indexer(self, indexer):
19291937

19301938
def _convert_key(self, key, is_setter=False):
19311939
""" require integer args (and convert to label arguments) """
1940+
1941+
# when setting, skip the integer validation below and pass the key through as a list
1942+
if is_setter:
1943+
return list(key)
1944+
19321945
for a, i in zip(self.obj.axes, key):
19331946
if not is_integer(i):
19341947
raise ValueError("iAt based indexing can only have integer "

pandas/core/internals.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1760,12 +1760,13 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
17601760
# use block's copy logic.
17611761
# .values may be an Index which does shallow copy by default
17621762
new_values = self.values if inplace else self.copy().values
1763+
new_values = new_values.to_dense()
17631764
new_values, _, new, _ = self._try_coerce_args(new_values, new)
17641765

17651766
if isinstance(new, np.ndarray) and len(new) == len(mask):
17661767
new = new[mask]
17671768

1768-
mask = _safe_reshape(mask, new_values.shape)
1769+
mask = _safe_reshape(np.asarray(mask), new_values.shape)
17691770

17701771
new_values[mask] = new
17711772
new_values = self._try_coerce_result(new_values)
@@ -2804,6 +2805,12 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
28042805
return self.make_block_same_class(values=values,
28052806
placement=self.mgr_locs)
28062807

2808+
def _try_coerce_result(self, result):
2809+
if not is_sparse(result):
2810+
result = SparseArray(result, kind=self.kind,
2811+
fill_value=self.fill_value, dtype=self.dtype)
2812+
return result
2813+
28072814
def __len__(self):
28082815
try:
28092816
return self.sp_index.length

pandas/core/series.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -950,8 +950,7 @@ def set_value(self, label, value, takeable=False):
950950
Returns
951951
-------
952952
series : Series
953-
If label is contained, will be reference to calling Series,
954-
otherwise a new object
953+
self
955954
"""
956955
warnings.warn("set_value is deprecated and will be removed "
957956
"in a future release. Please use "

pandas/core/sparse/frame.py

Lines changed: 2 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,8 @@ def _apply_columns(self, func):
322322
default_fill_value=self.default_fill_value,
323323
default_kind=self.default_kind).__finalize__(self)
324324

325-
def astype(self, dtype):
326-
return self._apply_columns(lambda x: x.astype(dtype))
325+
def astype(self, dtype, **kwargs):
326+
return self._apply_columns(lambda x: x.astype(dtype, **kwargs))
327327

328328
def copy(self, deep=True):
329329
"""
@@ -465,44 +465,6 @@ def _get_value(self, index, col, takeable=False):
465465
return series._get_value(index, takeable=takeable)
466466
_get_value.__doc__ = get_value.__doc__
467467

468-
def set_value(self, index, col, value, takeable=False):
469-
"""
470-
Put single value at passed column and index
471-
472-
.. deprecated:: 0.21.0
473-
474-
Please use .at[] or .iat[] accessors.
475-
476-
Parameters
477-
----------
478-
index : row label
479-
col : column label
480-
value : scalar value
481-
takeable : interpret the index/col as indexers, default False
482-
483-
Notes
484-
-----
485-
This method *always* returns a new object. It is currently not
486-
particularly efficient (and potentially very expensive) but is provided
487-
for API compatibility with DataFrame
488-
489-
Returns
490-
-------
491-
frame : DataFrame
492-
"""
493-
warnings.warn("set_value is deprecated and will be removed "
494-
"in a future release. Please use "
495-
".at[] or .iat[] accessors instead", FutureWarning,
496-
stacklevel=2)
497-
return self._set_value(index, col, value, takeable=takeable)
498-
499-
def _set_value(self, index, col, value, takeable=False):
500-
dense = self.to_dense()._set_value(
501-
index, col, value, takeable=takeable)
502-
return dense.to_sparse(kind=self._default_kind,
503-
fill_value=self._default_fill_value)
504-
_set_value.__doc__ = set_value.__doc__
505-
506468
def _slice(self, slobj, axis=0, kind=None):
507469
if axis == 0:
508470
new_index = self.index[slobj]

pandas/core/sparse/series.py

Lines changed: 103 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,20 @@
99
import warnings
1010

1111
from pandas.core.dtypes.missing import isna, notna
12-
from pandas.core.dtypes.common import is_scalar
12+
from pandas.core.dtypes.common import is_scalar, is_sparse
1313
from pandas.core.common import _values_from_object, _maybe_match_name
1414

1515
from pandas.compat.numpy import function as nv
1616
from pandas.core.index import Index, _ensure_index, InvalidIndexError
17+
from pandas.core.indexing import check_bool_indexer
1718
from pandas.core.series import Series
1819
from pandas.core.frame import DataFrame
1920
from pandas.core.internals import SingleBlockManager
2021
from pandas.core import generic
2122
import pandas.core.common as com
2223
import pandas.core.ops as ops
2324
import pandas._libs.index as _index
25+
from pandas.errors import PerformanceWarning
2426
from pandas.util._decorators import Appender
2527

2628
from pandas.core.sparse.array import (
@@ -315,8 +317,13 @@ def __array_wrap__(self, result, context=None):
315317
else:
316318
fill_value = self.fill_value
317319

320+
# The result has as many values as the sparse index has points, so the old sparse index is reused (TODO: confirm it is still valid in this case)
321+
if np.size(result) == self.sp_index.npoints:
322+
sp_index = self.sp_index
323+
else:
324+
sp_index = None
318325
return self._constructor(result, index=self.index,
319-
sparse_index=self.sp_index,
326+
sparse_index=sp_index,
320327
fill_value=fill_value,
321328
copy=False).__finalize__(self)
322329

@@ -427,7 +434,22 @@ def _get_values(self, indexer):
427434
return self[indexer]
428435

429436
def _set_with_engine(self, key, value):
437+
<<<<<<< HEAD
430438
return self._set_value(key, value)
439+
||||||| parent of e95270f1e... ENH: Allow SparseDataFrame/SparseSeries values assignment
440+
return self.set_value(key, value)
441+
=======
442+
takeable = False
443+
444+
# Sparse doesn't support reshaping so the standard .where() does
445+
# not apply. We short-circuit bool indexers here by treating them as
446+
# regular list of indexes and setting each array/value separately
447+
if com.is_bool_indexer(key):
448+
key = check_bool_indexer(self.index, key).nonzero()[0]
449+
takeable = True
450+
451+
return self.set_value(key, value, takeable=takeable)
452+
>>>>>>> e95270f1e... ENH: Allow SparseDataFrame/SparseSeries values assignment
431453

432454
def abs(self):
433455
"""
@@ -526,18 +548,62 @@ def set_value(self, label, value, takeable=False):
526548
return self._set_value(label, value, takeable=takeable)
527549

528550
def _set_value(self, label, value, takeable=False):
529-
values = self.to_dense()
530-
531-
# if the label doesn't exist, we will create a new object here
532-
# and possibly change the index
533-
new_values = values._set_value(label, value, takeable=takeable)
534-
if new_values is not None:
535-
values = new_values
536-
new_index = values.index
537-
values = SparseArray(values, fill_value=self.fill_value,
538-
kind=self.kind)
539-
self._data = SingleBlockManager(values, new_index)
540-
self._index = new_index
551+
try:
552+
loc = self.index.get_loc(label)
553+
except (KeyError, TypeError):
554+
loc = None
555+
556+
warnings.warn(
557+
'Setting SparseSeries values is inefficient '
558+
'(a copy of data is made)', PerformanceWarning, stacklevel=2)
559+
560+
# If label is not unique in index, or it is takeable,
561+
# amend the series by amending its dense copy
562+
if not isinstance(loc, int) or takeable:
563+
warnings.warn(
564+
'Setting SparseSeries values is particularly inefficient when '
565+
'indexing with a non-unique label because the whole series '
566+
'is made dense interim.', PerformanceWarning, stacklevel=2)
567+
values = self.to_dense()
568+
values.set_value(label, value, takeable=takeable)
569+
570+
index = values.index
571+
sp_index = None
572+
values = values.to_sparse(kind=self.kind,
573+
fill_value=self.fill_value)
574+
575+
# If label is unique key and not takeable, then it is more space-
576+
# efficient to not make the whole series dense, rather just its
577+
# defined part
578+
else:
579+
values = self._to_dense(sparse_only=True)
580+
old_index = values.index
581+
values.set_value(label, value, takeable=takeable)
582+
index = self.index
583+
584+
# label was already in sparse index, we can just reuse old index
585+
if label in old_index:
586+
sp_index = self.sp_index
587+
588+
# label might have been at least in .index
589+
else:
590+
# and if not, just add it, then construct both indexes anew
591+
if loc is None:
592+
index = self.index.append(Index((label,)))
593+
loc = len(index) - 1
594+
595+
indices = np.append(
596+
self.sp_index.to_int_index().indices,
597+
np.array(loc, dtype=np.int32))
598+
order = indices.argsort()
599+
values = values.values.take(order)
600+
indices = indices.take(order)
601+
sp_index = _make_index(len(index), indices, self.kind)
602+
603+
values = SparseArray(values, sparse_index=sp_index, kind=self.kind,
604+
fill_value=self.fill_value)
605+
self._data = SingleBlockManager(values, index)
606+
self._index = index
541607
_set_value.__doc__ = set_value.__doc__
542608

543609
def _set_values(self, key, value):
@@ -562,7 +628,8 @@ def to_dense(self, sparse_only=False):
562628
Parameters
563629
----------
564630
sparse_only: bool, default False
565-
DEPRECATED: this argument will be removed in a future version.
631+
.. deprecated:: 0.19.2
632+
This argument will be removed in a future version.
566633
567634
If True, return just the non-sparse values, or the dense version
568635
of `self.values` if False.
@@ -575,12 +642,31 @@ def to_dense(self, sparse_only=False):
575642
warnings.warn(("The 'sparse_only' parameter has been deprecated "
576643
"and will be removed in a future version."),
577644
FutureWarning, stacklevel=2)
645+
return self._to_dense(sparse_only=sparse_only)
646+
647+
def _to_dense(self, sparse_only=False):
648+
"""
649+
Convert SparseSeries to a Series.
650+
651+
Parameters
652+
----------
653+
sparse_only: bool, default False
654+
If True, return just the non-sparse values, or the dense version
655+
of `self.values` if False.
656+
657+
Returns
658+
-------
659+
s : Series
660+
"""
661+
if sparse_only:
578662
int_index = self.sp_index.to_int_index()
579663
index = self.index.take(int_index.indices)
580664
return Series(self.sp_values, index=index, name=self.name)
581665
else:
582-
return Series(self.values.to_dense(), index=self.index,
583-
name=self.name)
666+
values = self.values
667+
if is_sparse(values):
668+
values = values.to_dense()
669+
return Series(values, index=self.index, name=self.name)
584670

585671
@property
586672
def density(self):

pandas/tests/sparse/test_frame.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1534,3 +1534,33 @@ def test_quantile_multi(self):
15341534

15351535
tm.assert_frame_equal(result, dense_expected)
15361536
tm.assert_sp_frame_equal(result, sparse_expected)
1537+
1538+
1539+
@pytest.mark.parametrize('kind', ['integer', 'block'])
1540+
@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat'])
1541+
@pytest.mark.parametrize('key', [0, [0, 1], [True, False], None])
1542+
def test_frame_assignment(kind, indexer, key):
1543+
try_multiple = 'at' not in (indexer or '')
1544+
is_multi_key = np.asarray(key).ndim > 0
1545+
if is_multi_key and not try_multiple:
1546+
return
1547+
if not indexer and not is_multi_key and key is not None: # plain __setitem__ (no indexer) is only exercised with list-like keys or the mask case; skip scalar keys
1548+
return
1549+
if indexer and key is None: # key=None (frame-shaped null mask built below) is only exercised with plain __setitem__; skip it for .loc/.iloc/.at/.iat
1550+
return
1551+
1552+
arr = np.array([[1, nan],
1553+
[nan, 1]])
1554+
sdf = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
1555+
1556+
if key is None:
1557+
key = pd.isnull(sdf).to_sparse()
1558+
1559+
arr = arr.copy()
1560+
arr[np.asarray(key)] = 2
1561+
res = SparseDataFrame(arr, default_kind=kind).to_sparse(kind=kind)
1562+
1563+
sdf_setitem = getattr(sdf, indexer) if indexer else sdf
1564+
sdf_setitem[key] = 2
1565+
1566+
tm.assert_sp_frame_equal(sdf, res)

pandas/tests/sparse/test_series.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1535,3 +1535,28 @@ def test_constructor_dict_datetime64_index(datetime_type):
15351535
expected = SparseSeries(values, map(pd.Timestamp, dates))
15361536

15371537
tm.assert_sp_series_equal(result, expected)
1538+
1539+
1540+
@pytest.mark.parametrize('kind', ['integer', 'block'])
1541+
@pytest.mark.parametrize('indexer', [None, 'loc', 'iloc', 'at', 'iat'])
1542+
@pytest.mark.parametrize('key', [0, [0, 1], 2, [2, 3],
1543+
[True, False, False, False],
1544+
[False, False, False, True],])
1545+
def test_series_assignment(kind, indexer, key):
1546+
try_multiple = 'at' not in (indexer or '')
1547+
is_multi_key = np.asarray(key).ndim > 0
1548+
if is_multi_key and not try_multiple:
1549+
return
1550+
1551+
arr = np.array([0., 0., nan, nan])
1552+
ss = SparseSeries(arr, kind=kind)
1553+
assert len(ss.sp_index.to_int_index().indices) == 2
1554+
1555+
res = arr.copy()
1556+
res[key] = 1
1557+
res = SparseSeries(res, kind=kind)
1558+
1559+
ss_setitem = getattr(ss, indexer) if indexer else ss
1560+
ss_setitem[key] = 1
1561+
1562+
tm.assert_sp_series_equal(ss, res)

0 commit comments

Comments
 (0)