Skip to content

Commit 04f0ef4

Browse files
authored
Merge branch 'pandas-dev:main' into main
2 parents e89a969 + 4eef5f6 commit 04f0ef4

File tree

25 files changed

+236
-46
lines changed

25 files changed

+236
-46
lines changed

.github/workflows/unit-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171
# It will be temporarily activated during tests with locale.setlocale
7272
extra_loc: "zh_CN"
7373
platform: ubuntu-24.04
74-
- name: "Past no infer strings"
74+
- name: "PANDAS_FUTURE_INFER_STRING=0"
7575
env_file: actions-312.yaml
7676
pandas_future_infer_string: "0"
7777
platform: ubuntu-24.04

doc/source/user_guide/indexing.rst

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1732,3 +1732,49 @@ Why does assignment fail when using chained indexing?
17321732
This means that chained indexing will never work.
17331733
See :ref:`this section <copy_on_write_chained_assignment>`
17341734
for more context.
1735+
1736+
.. _indexing.series_assignment:
1737+
1738+
Series Assignment and Index Alignment
1739+
-------------------------------------
1740+
1741+
When assigning a Series to a DataFrame column, pandas performs automatic alignment
1742+
based on index labels. This is a fundamental behavior that can be surprising to
1743+
new users who might expect positional assignment.
1744+
1745+
Key Points:
1746+
~~~~~~~~~~~
1747+
1748+
* Series values are matched to DataFrame rows by index label
1749+
* Position/order in the Series doesn't matter
1750+
* Missing index labels result in NaN values
1751+
* This behavior is consistent across ``df[col] = series`` and ``df.loc[:, col] = series``
1752+
1753+
Examples:
1754+
.. ipython:: python
1755+
1756+
import pandas as pd
1757+
1758+
# Create a DataFrame
1759+
df = pd.DataFrame({'values': [1, 2, 3]}, index=['x', 'y', 'z'])
1760+
1761+
# Series with matching indices (different order)
1762+
s1 = pd.Series([10, 20, 30], index=['z', 'x', 'y'])
1763+
df['aligned'] = s1 # Aligns by index, not position
1764+
print(df)
1765+
1766+
# Series with partial index match
1767+
s2 = pd.Series([100, 200], index=['x', 'z'])
1768+
df['partial'] = s2 # Missing 'y' gets NaN
1769+
print(df)
1770+
1771+
# Series with non-matching indices
1772+
s3 = pd.Series([1000, 2000], index=['a', 'b'])
1773+
df['nomatch'] = s3 # All values become NaN
1774+
print(df)
1775+
1776+
1777+
# Avoiding confusion:
1778+
# If you want positional assignment instead of index alignment,
1779+
# pass the underlying values so that the Series index is ignored
1780+
df['s1_values'] = s1.to_numpy()

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,7 @@ Bug fixes
687687
Categorical
688688
^^^^^^^^^^^
689689
- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`)
690+
- Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`)
690691
- Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`)
691692
- Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`)
692693
-

pandas/conftest.py

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -176,25 +176,19 @@ def pytest_collection_modifyitems(items, config) -> None:
176176
ignore_doctest_warning(item, path, message)
177177

178178

179-
hypothesis_health_checks = [
180-
hypothesis.HealthCheck.too_slow,
181-
hypothesis.HealthCheck.differing_executors,
182-
]
183-
184-
# Hypothesis
179+
# Similar to "ci" config in
180+
# https://hypothesis.readthedocs.io/en/latest/reference/api.html#built-in-profiles
185181
hypothesis.settings.register_profile(
186-
"ci",
187-
# Hypothesis timing checks are tuned for scalars by default, so we bump
188-
# them from 200ms to 500ms per test case as the global default. If this
189-
# is too short for a specific test, (a) try to make it faster, and (b)
190-
# if it really is slow add `@settings(deadline=...)` with a working value,
191-
# or `deadline=None` to entirely disable timeouts for that test.
192-
# 2022-02-09: Changed deadline from 500 -> None. Deadline leads to
193-
# non-actionable, flaky CI failures (# GH 24641, 44969, 45118, 44969)
182+
"pandas_ci",
183+
database=None,
194184
deadline=None,
195-
suppress_health_check=tuple(hypothesis_health_checks),
185+
max_examples=15,
186+
suppress_health_check=(
187+
hypothesis.HealthCheck.too_slow,
188+
hypothesis.HealthCheck.differing_executors,
189+
),
196190
)
197-
hypothesis.settings.load_profile("ci")
191+
hypothesis.settings.load_profile("pandas_ci")
198192

199193
# Registering these strategies makes them globally available via st.from_type,
200194
# which is used for offsets in tests/tseries/offsets/test_offsets_properties.py

pandas/core/arrays/categorical.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -575,7 +575,7 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
575575
# GH 10696/18593/18630
576576
dtype = self.dtype.update_dtype(dtype)
577577
self = self.copy() if copy else self
578-
result = self._set_dtype(dtype)
578+
result = self._set_dtype(dtype, copy=False)
579579

580580
elif isinstance(dtype, ExtensionDtype):
581581
return super().astype(dtype, copy=copy)
@@ -945,7 +945,7 @@ def _set_categories(self, categories, fastpath: bool = False) -> None:
945945

946946
super().__init__(self._ndarray, new_dtype)
947947

948-
def _set_dtype(self, dtype: CategoricalDtype) -> Self:
948+
def _set_dtype(self, dtype: CategoricalDtype, copy: bool = True) -> Self:
949949
"""
950950
Internal method for directly updating the CategoricalDtype
951951
@@ -958,7 +958,9 @@ def _set_dtype(self, dtype: CategoricalDtype) -> Self:
958958
We don't do any validation here. It's assumed that the dtype is
959959
a (valid) instance of `CategoricalDtype`.
960960
"""
961-
codes = recode_for_categories(self.codes, self.categories, dtype.categories)
961+
codes = recode_for_categories(
962+
self.codes, self.categories, dtype.categories, copy
963+
)
962964
return type(self)._simple_new(codes, dtype=dtype)
963965

964966
def set_ordered(self, value: bool) -> Self:

pandas/core/frame.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4213,6 +4213,89 @@ def isetitem(self, loc, value) -> None:
42134213
self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs)
42144214

42154215
def __setitem__(self, key, value) -> None:
4216+
"""
4217+
Set item(s) in DataFrame by key.
4218+
4219+
This method allows you to set the values of one or more columns in the
4220+
DataFrame using a key. If the key does not exist, a new
4221+
column will be created.
4222+
4223+
Parameters
4224+
----------
4225+
key : label, list of labels, or tuple
4226+
Column label(s) to set. Can be a single column name, list of column names,
4227+
or tuple for MultiIndex columns.
4228+
value : scalar, array-like, Series, or DataFrame
4229+
Value(s) to set for the specified key(s).
4230+
4231+
Returns
4232+
-------
4233+
None
4234+
This method does not return a value.
4235+
4236+
See Also
4237+
--------
4238+
DataFrame.loc : Access and set values by label-based indexing.
4239+
DataFrame.iloc : Access and set values by position-based indexing.
4240+
DataFrame.assign : Assign new columns to a DataFrame.
4241+
4242+
Notes
4243+
-----
4244+
When assigning a Series to a DataFrame column, pandas aligns the Series
4245+
by index labels, not by position. This means:
4246+
4247+
* Values from the Series are matched to DataFrame rows by index label
4248+
* If a Series index label doesn't exist in the DataFrame index, it's ignored
4249+
* If a DataFrame index label doesn't exist in the Series index, NaN is assigned
4250+
* The order of values in the Series doesn't matter; only the index labels matter
4251+
4252+
Examples
4253+
--------
4254+
Basic column assignment:
4255+
4256+
>>> df = pd.DataFrame({"A": [1, 2, 3]})
4257+
>>> df["B"] = [4, 5, 6] # Assigns by position
4258+
>>> df
4259+
A B
4260+
0 1 4
4261+
1 2 5
4262+
2 3 6
4263+
4264+
Series assignment with index alignment:
4265+
4266+
>>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 2])
4267+
>>> s = pd.Series([10, 20], index=[1, 3]) # Note: index 3 doesn't exist in df
4268+
>>> df["B"] = s # Assigns by index label, not position
4269+
>>> df
4270+
A B
4271+
0 1 NaN
4272+
1 2 10.0
4273+
2 3 NaN
4274+
4275+
Series assignment with partial index match:
4276+
4277+
>>> df = pd.DataFrame({"A": [1, 2, 3, 4]}, index=["a", "b", "c", "d"])
4278+
>>> s = pd.Series([100, 200], index=["b", "d"])
4279+
>>> df["B"] = s
4280+
>>> df
4281+
A B
4282+
a 1 NaN
4283+
b 2 100.0
4284+
c 3 NaN
4285+
d 4 200.0
4286+
4287+
Series index labels that are not in the DataFrame index are ignored:
4288+
4289+
>>> df = pd.DataFrame({"A": [1, 2, 3]}, index=["x", "y", "z"])
4290+
>>> s = pd.Series([10, 20, 30, 40, 50], index=["x", "y", "a", "b", "z"])
4291+
>>> df["B"] = s
4292+
>>> df
4293+
A B
4294+
x 1 10
4295+
y 2 20
4296+
z 3 50
4297+
# Values for 'a' and 'b' are completely ignored!
4298+
"""
42164299
if not PYPY:
42174300
if sys.getrefcount(self) <= 3:
42184301
warnings.warn(

pandas/core/indexing.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,22 @@ def loc(self) -> _LocIndexer:
609609
610610
Please see the :ref:`user guide<advanced.advanced_hierarchical>`
611611
for more details and explanations of advanced indexing.
612+
613+
**Assignment with Series**
614+
615+
When assigning a Series to .loc[row_indexer, col_indexer], pandas aligns
616+
the Series by index labels, not by order or position.
617+
618+
Series assignment with .loc and index alignment:
619+
620+
>>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 2])
621+
>>> s = pd.Series([10, 20], index=[1, 0]) # Note reversed order
622+
>>> df.loc[:, "B"] = s # Aligns by index, not order
623+
>>> df
624+
A B
625+
0 1 20.0
626+
1 2 10.0
627+
2 3 NaN
612628
"""
613629
return _LocIndexer("loc", self)
614630

pandas/core/series.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1972,7 +1972,7 @@ def groupby(
19721972
as_index: bool = True,
19731973
sort: bool = True,
19741974
group_keys: bool = True,
1975-
observed: bool = False,
1975+
observed: bool = True,
19761976
dropna: bool = True,
19771977
) -> SeriesGroupBy:
19781978
from pandas.core.groupby.generic import SeriesGroupBy

pandas/errors/cow.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
11
_chained_assignment_msg = (
22
"A value is trying to be set on a copy of a DataFrame or Series "
33
"through chained assignment.\n"
4-
"When using the Copy-on-Write mode, such chained assignment never works "
5-
"to update the original DataFrame or Series, because the intermediate "
6-
"object on which we are setting values always behaves as a copy.\n\n"
4+
"Such chained assignment never works to update the original DataFrame or "
5+
"Series, because the intermediate object on which we are setting values "
6+
"always behaves as a copy (due to Copy-on-Write).\n\n"
77
"Try using '.loc[row_indexer, col_indexer] = value' instead, to perform "
88
"the assignment in a single step.\n\n"
9-
"See the caveats in the documentation: "
9+
"See the documentation for a more detailed explanation: "
1010
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
11-
"copy_on_write.html"
11+
"copy_on_write.html#chained-assignment"
1212
)
1313

1414

1515
_chained_assignment_method_msg = (
1616
"A value is trying to be set on a copy of a DataFrame or Series "
1717
"through chained assignment using an inplace method.\n"
18-
"When using the Copy-on-Write mode, such inplace method never works "
19-
"to update the original DataFrame or Series, because the intermediate "
20-
"object on which we are setting values always behaves as a copy.\n\n"
18+
"Such inplace method never works to update the original DataFrame or Series, "
19+
"because the intermediate object on which we are setting values always "
20+
"behaves as a copy (due to Copy-on-Write).\n\n"
2121
"For example, when doing 'df[col].method(value, inplace=True)', try "
2222
"using 'df.method({col: value}, inplace=True)' instead, to perform "
23-
"the operation inplace on the original object.\n\n"
23+
"the operation inplace on the original object, or try to avoid an inplace "
24+
"operation using 'df[col] = df[col].method(value)'.\n\n"
25+
"See the documentation for a more detailed explanation: "
26+
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
27+
"copy_on_write.html"
2428
)

pandas/tests/arrays/categorical/test_astype.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,14 @@ def test_astype_category(self, dtype_ordered, ordered):
130130
expected = cat
131131
tm.assert_categorical_equal(result, expected)
132132

133+
def test_astype_category_copy_false_nocopy_codes(self):
134+
# GH#62000
135+
cat = Categorical([3, 2, 4, 1])
136+
new = cat.astype("category", copy=False)
137+
assert tm.shares_memory(new.codes, cat.codes)
138+
new = cat.astype("category", copy=True)
139+
assert not tm.shares_memory(new.codes, cat.codes)
140+
133141
def test_astype_object_datetime_categories(self):
134142
# GH#40754
135143
cat = Categorical(to_datetime(["2021-03-27", NaT]))

0 commit comments

Comments
 (0)