Description
Pandas version checks
- [x] I have checked that this issue has not already been reported.
- [x] I have confirmed this bug exists on the latest version of pandas.
- [x] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
python -m pytest # ;-)
Issue Description
When running the test suite on 32-bit x86 with `pyarrow` installed, I'm getting the following test failures (compared to a run without `pyarrow`):
FAILED pandas/tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_arrow_dictionary_target - AssertionError: numpy array are different
FAILED pandas/tests/interchange/test_impl.py::test_large_string_pyarrow - OverflowError: Python int too large to convert to C ssize_t
FAILED pandas/tests/frame/methods/test_join.py::test_join_on_single_col_dup_on_right[string[pyarrow]] - ValueError: putmask: output array is read-only
FAILED pandas/tests/reshape/merge/test_multi.py::TestMergeMulti::test_left_join_multi_index[False-True] - ValueError: putmask: output array is read-only
Tracebacks
_______________________________________ TestGetIndexer.test_get_indexer_arrow_dictionary_target _______________________________________
[gw8] linux -- Python 3.11.7 /var/tmp/portage/dev-python/pandas-2.2.0-r1/work/pandas-2.2.0-python3_11/install/usr/bin/python3.11
self = <pandas.tests.indexes.numeric.test_indexing.TestGetIndexer object at 0xd9180110>
def test_get_indexer_arrow_dictionary_target(self):
pa = pytest.importorskip("pyarrow")
target = Index(
ArrowExtensionArray(
pa.array([1, 2], type=pa.dictionary(pa.int8(), pa.int8()))
)
)
idx = Index([1])
result = idx.get_indexer(target)
expected = np.array([0, -1], dtype=np.int64)
> tm.assert_numpy_array_equal(result, expected)
E AssertionError: numpy array are different
E
E Attribute "dtype" are different
E [left]: int32
E [right]: int64
expected = array([ 0, -1], dtype=int64)
idx = Index([1], dtype='int64')
pa = <module 'pyarrow' from '/usr/lib/python3.11/site-packages/pyarrow/__init__.py'>
result = array([ 0, -1], dtype=int32)
self = <pandas.tests.indexes.numeric.test_indexing.TestGetIndexer object at 0xd9180110>
target = Index([1, 2], dtype='dictionary<values=int8, indices=int8, ordered=0>[pyarrow]')
pandas/tests/indexes/numeric/test_indexing.py:406: AssertionError
______________________________________________________ test_large_string_pyarrow ______________________________________________________
[gw8] linux -- Python 3.11.7 /var/tmp/portage/dev-python/pandas-2.2.0-r1/work/pandas-2.2.0-python3_11/install/usr/bin/python3.11
def test_large_string_pyarrow():
# GH 52795
pa = pytest.importorskip("pyarrow", "11.0.0")
arr = ["Mon", "Tue"]
table = pa.table({"weekday": pa.array(arr, "large_string")})
exchange_df = table.__dataframe__()
result = from_dataframe(exchange_df)
expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
tm.assert_frame_equal(result, expected)
# check round-trip
> assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
arr = ['Mon', 'Tue']
exchange_df = <pyarrow.interchange.dataframe._PyArrowDataFrame object at 0xd877c7d0>
expected = weekday
0 Mon
1 Tue
pa = <module 'pyarrow' from '/usr/lib/python3.11/site-packages/pyarrow/__init__.py'>
result = weekday
0 Mon
1 Tue
table = pyarrow.Table
weekday: large_string
----
weekday: [["Mon","Tue"]]
pandas/tests/interchange/test_impl.py:104:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.11/site-packages/pyarrow/interchange/from_dataframe.py:113: in from_dataframe
return _from_dataframe(df.__dataframe__(allow_copy=allow_copy),
allow_copy = True
df = weekday
0 Mon
1 Tue
/usr/lib/python3.11/site-packages/pyarrow/interchange/from_dataframe.py:136: in _from_dataframe
batch = protocol_df_chunk_to_pyarrow(chunk, allow_copy)
allow_copy = True
batches = []
chunk = <pandas.core.interchange.dataframe.PandasDataFrameXchg object at 0xccc70410>
df = <pandas.core.interchange.dataframe.PandasDataFrameXchg object at 0xccc70410>
/usr/lib/python3.11/site-packages/pyarrow/interchange/from_dataframe.py:182: in protocol_df_chunk_to_pyarrow
columns[name] = column_to_array(col, allow_copy)
allow_copy = True
col = <pandas.core.interchange.column.PandasColumn object at 0xccc703d0>
columns = {}
df = <pandas.core.interchange.dataframe.PandasDataFrameXchg object at 0xccc70410>
dtype = <DtypeKind.STRING: 21>
name = 'weekday'
/usr/lib/python3.11/site-packages/pyarrow/interchange/from_dataframe.py:214: in column_to_array
data = buffers_to_array(buffers, data_type,
allow_copy = True
buffers = {'data': (PandasBuffer({'bufsize': 6, 'ptr': 3445199088, 'device': 'CPU'}),
(<DtypeKind.STRING: 21>, 8, 'u', '=')),
'offsets': (PandasBuffer({'bufsize': 24, 'ptr': 1546049072, 'device': 'CPU'}),
(<DtypeKind.INT: 0>, 64, 'l', '=')),
'validity': (PandasBuffer({'bufsize': 2, 'ptr': 1544334624, 'device': 'CPU'}),
(<DtypeKind.BOOL: 20>, 8, 'b', '='))}
col = <pandas.core.interchange.column.PandasColumn object at 0xccc703d0>
data_type = (<DtypeKind.STRING: 21>, 8, 'u', '=')
/usr/lib/python3.11/site-packages/pyarrow/interchange/from_dataframe.py:396: in buffers_to_array
data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize,
_ = (<DtypeKind.STRING: 21>, 8, 'u', '=')
allow_copy = True
buffers = {'data': (PandasBuffer({'bufsize': 6, 'ptr': 3445199088, 'device': 'CPU'}),
(<DtypeKind.STRING: 21>, 8, 'u', '=')),
'offsets': (PandasBuffer({'bufsize': 24, 'ptr': 1546049072, 'device': 'CPU'}),
(<DtypeKind.INT: 0>, 64, 'l', '=')),
'validity': (PandasBuffer({'bufsize': 2, 'ptr': 1544334624, 'device': 'CPU'}),
(<DtypeKind.BOOL: 20>, 8, 'b', '='))}
data_buff = PandasBuffer({'bufsize': 6, 'ptr': 3445199088, 'device': 'CPU'})
data_type = (<DtypeKind.STRING: 21>, 8, 'u', '=')
describe_null = (<ColumnNullType.USE_BYTEMASK: 4>, 0)
length = 2
offset = 0
offset_buff = PandasBuffer({'bufsize': 24, 'ptr': 1546049072, 'device': 'CPU'})
offset_dtype = (<DtypeKind.INT: 0>, 64, 'l', '=')
validity_buff = PandasBuffer({'bufsize': 2, 'ptr': 1544334624, 'device': 'CPU'})
validity_dtype = (<DtypeKind.BOOL: 20>, 8, 'b', '=')
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
> ???
E OverflowError: Python int too large to convert to C ssize_t
pyarrow/io.pxi:1990: OverflowError
________________________________________ test_join_on_single_col_dup_on_right[string[pyarrow]] ________________________________________
[gw5] linux -- Python 3.11.7 /var/tmp/portage/dev-python/pandas-2.2.0-r1/work/pandas-2.2.0-python3_11/install/usr/bin/python3.11
left_no_dup = a b
0 a cat
1 b dog
2 c weasel
3 d horse
right_w_dups = c
a
<NA> meow
<NA> bark
<NA> um... weasel noise?
<NA> nay
<NA> chirp
e moo
dtype = 'string[pyarrow]'
@pytest.mark.parametrize("dtype", ["object", "string[pyarrow]"])
def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups, dtype):
# GH 46622
# Dups on right allowed by one_to_many constraint
if dtype == "string[pyarrow]":
pytest.importorskip("pyarrow")
left_no_dup = left_no_dup.astype(dtype)
right_w_dups.index = right_w_dups.index.astype(dtype)
> left_no_dup.join(
right_w_dups,
on="a",
validate="one_to_many",
)
dtype = 'string[pyarrow]'
left_no_dup = a b
0 a cat
1 b dog
2 c weasel
3 d horse
right_w_dups = c
a
<NA> meow
<NA> bark
<NA> um... weasel noise?
<NA> nay
<NA> chirp
e moo
pandas/tests/frame/methods/test_join.py:169:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pandas/core/frame.py:10730: in join
return merge(
concat = <function concat at 0xec6cdc58>
how = 'left'
lsuffix = ''
merge = <function merge at 0xec226e88>
on = 'a'
other = c
a
<NA> meow
<NA> bark
<NA> um... weasel noise?
<NA> nay
<NA> chirp
e moo
rsuffix = ''
self = a b
0 a cat
1 b dog
2 c weasel
3 d horse
sort = False
validate = 'one_to_many'
pandas/core/reshape/merge.py:184: in merge
return op.get_result(copy=copy)
copy = None
how = 'left'
indicator = False
left = a b
0 a cat
1 b dog
2 c weasel
3 d horse
left_df = a b
0 a cat
1 b dog
2 c weasel
3 d horse
left_index = False
left_on = 'a'
on = None
op = <pandas.core.reshape.merge._MergeOperation object at 0xcbf60b30>
right = c
a
<NA> meow
<NA> bark
<NA> um... weasel noise?
<NA> nay
<NA> chirp
e moo
right_df = c
a
<NA> meow
<NA> bark
<NA> um... weasel noise?
<NA> nay
<NA> chirp
e moo
right_index = True
right_on = None
sort = False
suffixes = ('', '')
validate = 'one_to_many'
pandas/core/reshape/merge.py:886: in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
copy = None
self = <pandas.core.reshape.merge._MergeOperation object at 0xcbf60b30>
pandas/core/reshape/merge.py:1142: in _get_join_info
join_index, left_indexer, right_indexer = _left_join_on_index(
left_ax = RangeIndex(start=0, stop=4, step=1)
right_ax = Index([<NA>, <NA>, <NA>, <NA>, <NA>, 'e'], dtype='string', name='a')
self = <pandas.core.reshape.merge._MergeOperation object at 0xcbf60b30>
pandas/core/reshape/merge.py:2385: in _left_join_on_index
left_key, right_key, count = _factorize_keys(lkey, rkey, sort=sort)
join_keys = [<ArrowStringArray>
['a', 'b', 'c', 'd']
Length: 4, dtype: string]
left_ax = RangeIndex(start=0, stop=4, step=1)
lkey = <ArrowStringArray>
['a', 'b', 'c', 'd']
Length: 4, dtype: string
right_ax = Index([<NA>, <NA>, <NA>, <NA>, <NA>, 'e'], dtype='string', name='a')
rkey = <ArrowStringArray>
[<NA>, <NA>, <NA>, <NA>, <NA>, 'e']
Length: 6, dtype: string
sort = False
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
lk = <pyarrow.lib.ChunkedArray object at 0xcba48fa0>
[
[
"a",
"b",
"c",
"d"
]
]
rk = <pyarrow.lib.ChunkedArray object at 0xcbf280f0>
[
[
null,
null,
null,
null,
null,
"e"
]
]
sort = False
def _factorize_keys(
lk: ArrayLike, rk: ArrayLike, sort: bool = True
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
"""
Encode left and right keys as enumerated types.
This is used to get the join indexers to be used when merging DataFrames.
Parameters
----------
lk : ndarray, ExtensionArray
Left key.
rk : ndarray, ExtensionArray
Right key.
sort : bool, defaults to True
If True, the encoding is done such that the unique elements in the
keys are sorted.
Returns
-------
np.ndarray[np.intp]
Left (resp. right if called with `key='right'`) labels, as enumerated type.
np.ndarray[np.intp]
Right (resp. left if called with `key='right'`) labels, as enumerated type.
int
Number of unique elements in union of left and right labels.
See Also
--------
merge : Merge DataFrame or named Series objects
with a database-style join.
algorithms.factorize : Encode the object as an enumerated type
or categorical variable.
Examples
--------
>>> lk = np.array(["a", "c", "b"])
>>> rk = np.array(["a", "c"])
Here, the unique values are `'a', 'b', 'c'`. With the default
`sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
>>> pd.core.reshape.merge._factorize_keys(lk, rk)
(array([0, 2, 1]), array([0, 2]), 3)
With the `sort=False`, the encoding will correspond to the order
in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
>>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
(array([0, 1, 2]), array([0, 1]), 3)
"""
# TODO: if either is a RangeIndex, we can likely factorize more efficiently?
if (
isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")):
# Extract the ndarray (UTC-localized) values
# Note: we dont need the dtypes to match, as these can still be compared
lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
lk = cast("DatetimeArray", lk)._ndarray
rk = cast("DatetimeArray", rk)._ndarray
elif (
isinstance(lk.dtype, CategoricalDtype)
and isinstance(rk.dtype, CategoricalDtype)
and lk.dtype == rk.dtype
):
assert isinstance(lk, Categorical)
assert isinstance(rk, Categorical)
# Cast rk to encoding so we can compare codes with lk
rk = lk._encode_with_my_categories(rk)
lk = ensure_int64(lk.codes)
rk = ensure_int64(rk.codes)
elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
isinstance(lk.dtype, StringDtype)
and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
):
import pyarrow as pa
import pyarrow.compute as pc
len_lk = len(lk)
lk = lk._pa_array # type: ignore[attr-defined]
rk = rk._pa_array # type: ignore[union-attr]
dc = (
pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr]
.combine_chunks()
.dictionary_encode()
)
llab, rlab, count = (
pc.fill_null(dc.indices[slice(len_lk)], -1)
.to_numpy()
.astype(np.intp, copy=False),
pc.fill_null(dc.indices[slice(len_lk, None)], -1)
.to_numpy()
.astype(np.intp, copy=False),
len(dc.dictionary),
)
if sort:
uniques = dc.dictionary.to_numpy(zero_copy_only=False)
llab, rlab = _sort_labels(uniques, llab, rlab)
if dc.null_count > 0:
lmask = llab == -1
lany = lmask.any()
rmask = rlab == -1
rany = rmask.any()
if lany:
np.putmask(llab, lmask, count)
if rany:
> np.putmask(rlab, rmask, count)
E ValueError: putmask: output array is read-only
count = 5
dc = <pyarrow.lib.DictionaryArray object at 0xcb83ffb0>
-- dictionary:
[
"a",
"b",
"c",
"d",
"e"
]
-- indices:
[
0,
1,
2,
3,
null,
null,
null,
null,
null,
4
]
lany = False
len_lk = 4
lk = <pyarrow.lib.ChunkedArray object at 0xcba48fa0>
[
[
"a",
"b",
"c",
"d"
]
]
llab = array([0, 1, 2, 3])
lmask = array([False, False, False, False])
pa = <module 'pyarrow' from '/usr/lib/python3.11/site-packages/pyarrow/__init__.py'>
pc = <module 'pyarrow.compute' from '/usr/lib/python3.11/site-packages/pyarrow/compute.py'>
rany = True
rk = <pyarrow.lib.ChunkedArray object at 0xcbf280f0>
[
[
null,
null,
null,
null,
null,
"e"
]
]
rlab = array([-1, -1, -1, -1, -1, 4])
rmask = array([ True, True, True, True, True, False])
sort = False
pandas/core/reshape/merge.py:2514: ValueError
________________________________________ TestMergeMulti.test_left_join_multi_index[False-True] ________________________________________
[gw3] linux -- Python 3.11.7 /var/tmp/portage/dev-python/pandas-2.2.0-r1/work/pandas-2.2.0-python3_11/install/usr/bin/python3.11
self = <pandas.tests.reshape.merge.test_multi.TestMergeMulti object at 0xd20d8ff0>, sort = False, infer_string = True
@pytest.mark.parametrize(
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
)
@pytest.mark.parametrize("sort", [True, False])
def test_left_join_multi_index(self, sort, infer_string):
with option_context("future.infer_string", infer_string):
icols = ["1st", "2nd", "3rd"]
def bind_cols(df):
iord = lambda a: 0 if a != a else ord(a)
f = lambda ts: ts.map(iord) - ord("a")
return f(df["1st"]) + f(df["3rd"]) * 1e2 + df["2nd"].fillna(0) * 10
def run_asserts(left, right, sort):
res = left.join(right, on=icols, how="left", sort=sort)
assert len(left) < len(res) + 1
assert not res["4th"].isna().any()
assert not res["5th"].isna().any()
tm.assert_series_equal(res["4th"], -res["5th"], check_names=False)
result = bind_cols(res.iloc[:, :-2])
tm.assert_series_equal(res["4th"], result, check_names=False)
assert result.name is None
if sort:
tm.assert_frame_equal(res, res.sort_values(icols, kind="mergesort"))
out = merge(left, right.reset_index(), on=icols, sort=sort, how="left")
res.index = RangeIndex(len(res))
tm.assert_frame_equal(out, res)
lc = list(map(chr, np.arange(ord("a"), ord("z") + 1)))
left = DataFrame(
np.random.default_rng(2).choice(lc, (50, 2)), columns=["1st", "3rd"]
)
# Explicit cast to float to avoid implicit cast when setting nan
left.insert(
1,
"2nd",
np.random.default_rng(2).integers(0, 10, len(left)).astype("float"),
)
i = np.random.default_rng(2).permutation(len(left))
right = left.iloc[i].copy()
left["4th"] = bind_cols(left)
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
run_asserts(left, right, sort)
# inject some nulls
left.loc[1::4, "1st"] = np.nan
left.loc[2::5, "2nd"] = np.nan
left.loc[3::6, "3rd"] = np.nan
left["4th"] = bind_cols(left)
i = np.random.default_rng(2).permutation(len(left))
right = left.iloc[i, :-1]
right["5th"] = -bind_cols(right)
right.set_index(icols, inplace=True)
> run_asserts(left, right, sort)
bind_cols = <function TestMergeMulti.test_left_join_multi_index.<locals>.bind_cols at 0xc9928b18>
i = array([ 6, 40, 33, 38, 7, 46, 28, 45, 5, 34, 12, 18, 27, 3, 9, 39, 42,
23, 0, 26, 4, 10, 14, 41, 16, 43, 15, 48, 13, 24, 20, 25, 22, 49,
2, 11, 32, 44, 47, 17, 19, 37, 21, 29, 31, 30, 35, 36, 8, 1])
icols = ['1st', '2nd', '3rd']
infer_string = True
lc = ['a',
'b',
'c',
'd',
'e',
'f',
'g',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
's',
't',
'u',
'v',
'w',
'x',
'y',
'z']
left = 1st 2nd 3rd 4th
0 v 8.0 g 701.0
1 NaN 2.0 h 623.0
2 k NaN v 2110.0
3 l 2.0 NaN -9669.0
4 i 4.0 p 1548.0
5 NaN 8.0 s 1783.0
6 z 4.0 e 465.0
7 w NaN b 122.0
8 o 3.0 h 744.0
9 NaN 6.0 NaN -9737.0
10 h 8.0 o 1487.0
11 g 7.0 d 376.0
12 t NaN l 1119.0
13 NaN 1.0 r 1613.0
14 y 8.0 k 1104.0
15 f 0.0 NaN -9695.0
16 y 5.0 z 2574.0
17 NaN NaN r 1603.0
18 j 2.0 k 1029.0
19 b 6.0 e 461.0
20 i 3.0 i 838.0
21 NaN 5.0 NaN -9747.0
22 s NaN x 2318.0
23 w 1.0 u 2032.0
24 z 7.0 i 895.0
25 NaN 4.0 y 2343.0
26 f 6.0 m 1265.0
27 o NaN NaN -9686.0
28 s 9.0 c 308.0
29 NaN 4.0 c 143.0
30 y 2.0 f 544.0
31 l 6.0 w 2271.0
32 n NaN r 1713.0
33 NaN 9.0 NaN -9707.0
34 p 8.0 q 1695.0
35 l 6.0 k 1071.0
36 p 3.0 n 1345.0
37 NaN NaN p 1403.0
38 m 0.0 w 2212.0
39 f 1.0 NaN -9685.0
40 m 3.0 x 2342.0
41 NaN 3.0 p 1433.0
42 b NaN v 2101.0
43 l 5.0 m 1261.0
44 c 6.0 s 1862.0
45 NaN 8.0 NaN -9717.0
46 t 8.0 n 1399.0
47 b NaN f 501.0
48 g 9.0 c 296.0
49 NaN 3.0 b 33.0
right = 5th
1st 2nd 3rd
z 4.0 e -465.0
m 3.0 x -2342.0
nan 9.0 nan 9707.0
m 0.0 w -2212.0
w NaN b -122.0
t 8.0 n -1399.0
s 9.0 c -308.0
nan 8.0 nan 9717.0
s -1783.0
p 8.0 q -1695.0
t NaN l -1119.0
j 2.0 k -1029.0
o NaN nan 9686.0
l 2.0 nan 9669.0
nan 6.0 nan 9737.0
f 1.0 nan 9685.0
b NaN v -2101.0
w 1.0 u -2032.0
v 8.0 g -701.0
f 6.0 m -1265.0
i 4.0 p -1548.0
h 8.0 o -1487.0
y 8.0 k -1104.0
nan 3.0 p -1433.0
y 5.0 z -2574.0
l 5.0 m -1261.0
f 0.0 nan 9695.0
g 9.0 c -296.0
nan 1.0 r -1613.0
z 7.0 i -895.0
i 3.0 i -838.0
nan 4.0 y -2343.0
s NaN x -2318.0
nan 3.0 b -33.0
k NaN v -2110.0
g 7.0 d -376.0
n NaN r -1713.0
c 6.0 s -1862.0
b NaN f -501.0
nan NaN r -1603.0
b 6.0 e -461.0
nan NaN p -1403.0
5.0 nan 9747.0
4.0 c -143.0
l 6.0 w -2271.0
y 2.0 f -544.0
l 6.0 k -1071.0
p 3.0 n -1345.0
o 3.0 h -744.0
nan 2.0 h -623.0
run_asserts = <function TestMergeMulti.test_left_join_multi_index.<locals>.run_asserts at 0xc9928bb8>
self = <pandas.tests.reshape.merge.test_multi.TestMergeMulti object at 0xd20d8ff0>
sort = False
pandas/tests/reshape/merge/test_multi.py:158:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pandas/tests/reshape/merge/test_multi.py:108: in run_asserts
res = left.join(right, on=icols, how="left", sort=sort)
bind_cols = <function TestMergeMulti.test_left_join_multi_index.<locals>.bind_cols at 0xc9928b18>
icols = ['1st', '2nd', '3rd']
left = 1st 2nd 3rd 4th
0 v 8.0 g 701.0
1 NaN 2.0 h 623.0
2 k NaN v 2110.0
3 l 2.0 NaN -9669.0
4 i 4.0 p 1548.0
5 NaN 8.0 s 1783.0
6 z 4.0 e 465.0
7 w NaN b 122.0
8 o 3.0 h 744.0
9 NaN 6.0 NaN -9737.0
10 h 8.0 o 1487.0
11 g 7.0 d 376.0
12 t NaN l 1119.0
13 NaN 1.0 r 1613.0
14 y 8.0 k 1104.0
15 f 0.0 NaN -9695.0
16 y 5.0 z 2574.0
17 NaN NaN r 1603.0
18 j 2.0 k 1029.0
19 b 6.0 e 461.0
20 i 3.0 i 838.0
21 NaN 5.0 NaN -9747.0
22 s NaN x 2318.0
23 w 1.0 u 2032.0
24 z 7.0 i 895.0
25 NaN 4.0 y 2343.0
26 f 6.0 m 1265.0
27 o NaN NaN -9686.0
28 s 9.0 c 308.0
29 NaN 4.0 c 143.0
30 y 2.0 f 544.0
31 l 6.0 w 2271.0
32 n NaN r 1713.0
33 NaN 9.0 NaN -9707.0
34 p 8.0 q 1695.0
35 l 6.0 k 1071.0
36 p 3.0 n 1345.0
37 NaN NaN p 1403.0
38 m 0.0 w 2212.0
39 f 1.0 NaN -9685.0
40 m 3.0 x 2342.0
41 NaN 3.0 p 1433.0
42 b NaN v 2101.0
43 l 5.0 m 1261.0
44 c 6.0 s 1862.0
45 NaN 8.0 NaN -9717.0
46 t 8.0 n 1399.0
47 b NaN f 501.0
48 g 9.0 c 296.0
49 NaN 3.0 b 33.0
right = 5th
1st 2nd 3rd
z 4.0 e -465.0
m 3.0 x -2342.0
nan 9.0 nan 9707.0
m 0.0 w -2212.0
w NaN b -122.0
t 8.0 n -1399.0
s 9.0 c -308.0
nan 8.0 nan 9717.0
s -1783.0
p 8.0 q -1695.0
t NaN l -1119.0
j 2.0 k -1029.0
o NaN nan 9686.0
l 2.0 nan 9669.0
nan 6.0 nan 9737.0
f 1.0 nan 9685.0
b NaN v -2101.0
w 1.0 u -2032.0
v 8.0 g -701.0
f 6.0 m -1265.0
i 4.0 p -1548.0
h 8.0 o -1487.0
y 8.0 k -1104.0
nan 3.0 p -1433.0
y 5.0 z -2574.0
l 5.0 m -1261.0
f 0.0 nan 9695.0
g 9.0 c -296.0
nan 1.0 r -1613.0
z 7.0 i -895.0
i 3.0 i -838.0
nan 4.0 y -2343.0
s NaN x -2318.0
nan 3.0 b -33.0
k NaN v -2110.0
g 7.0 d -376.0
n NaN r -1713.0
c 6.0 s -1862.0
b NaN f -501.0
nan NaN r -1603.0
b 6.0 e -461.0
nan NaN p -1403.0
5.0 nan 9747.0
4.0 c -143.0
l 6.0 w -2271.0
y 2.0 f -544.0
l 6.0 k -1071.0
p 3.0 n -1345.0
o 3.0 h -744.0
nan 2.0 h -623.0
sort = False
pandas/core/frame.py:10730: in join
return merge(
concat = <function concat at 0xec6bec58>
how = 'left'
lsuffix = ''
merge = <function merge at 0xec217e88>
on = ['1st', '2nd', '3rd']
other = 5th
1st 2nd 3rd
z 4.0 e -465.0
m 3.0 x -2342.0
nan 9.0 nan 9707.0
m 0.0 w -2212.0
w NaN b -122.0
t 8.0 n -1399.0
s 9.0 c -308.0
nan 8.0 nan 9717.0
s -1783.0
p 8.0 q -1695.0
t NaN l -1119.0
j 2.0 k -1029.0
o NaN nan 9686.0
l 2.0 nan 9669.0
nan 6.0 nan 9737.0
f 1.0 nan 9685.0
b NaN v -2101.0
w 1.0 u -2032.0
v 8.0 g -701.0
f 6.0 m -1265.0
i 4.0 p -1548.0
h 8.0 o -1487.0
y 8.0 k -1104.0
nan 3.0 p -1433.0
y 5.0 z -2574.0
l 5.0 m -1261.0
f 0.0 nan 9695.0
g 9.0 c -296.0
nan 1.0 r -1613.0
z 7.0 i -895.0
i 3.0 i -838.0
nan 4.0 y -2343.0
s NaN x -2318.0
nan 3.0 b -33.0
k NaN v -2110.0
g 7.0 d -376.0
n NaN r -1713.0
c 6.0 s -1862.0
b NaN f -501.0
nan NaN r -1603.0
b 6.0 e -461.0
nan NaN p -1403.0
5.0 nan 9747.0
4.0 c -143.0
l 6.0 w -2271.0
y 2.0 f -544.0
l 6.0 k -1071.0
p 3.0 n -1345.0
o 3.0 h -744.0
nan 2.0 h -623.0
rsuffix = ''
self = 1st 2nd 3rd 4th
0 v 8.0 g 701.0
1 NaN 2.0 h 623.0
2 k NaN v 2110.0
3 l 2.0 NaN -9669.0
4 i 4.0 p 1548.0
5 NaN 8.0 s 1783.0
6 z 4.0 e 465.0
7 w NaN b 122.0
8 o 3.0 h 744.0
9 NaN 6.0 NaN -9737.0
10 h 8.0 o 1487.0
11 g 7.0 d 376.0
12 t NaN l 1119.0
13 NaN 1.0 r 1613.0
14 y 8.0 k 1104.0
15 f 0.0 NaN -9695.0
16 y 5.0 z 2574.0
17 NaN NaN r 1603.0
18 j 2.0 k 1029.0
19 b 6.0 e 461.0
20 i 3.0 i 838.0
21 NaN 5.0 NaN -9747.0
22 s NaN x 2318.0
23 w 1.0 u 2032.0
24 z 7.0 i 895.0
25 NaN 4.0 y 2343.0
26 f 6.0 m 1265.0
27 o NaN NaN -9686.0
28 s 9.0 c 308.0
29 NaN 4.0 c 143.0
30 y 2.0 f 544.0
31 l 6.0 w 2271.0
32 n NaN r 1713.0
33 NaN 9.0 NaN -9707.0
34 p 8.0 q 1695.0
35 l 6.0 k 1071.0
36 p 3.0 n 1345.0
37 NaN NaN p 1403.0
38 m 0.0 w 2212.0
39 f 1.0 NaN -9685.0
40 m 3.0 x 2342.0
41 NaN 3.0 p 1433.0
42 b NaN v 2101.0
43 l 5.0 m 1261.0
44 c 6.0 s 1862.0
45 NaN 8.0 NaN -9717.0
46 t 8.0 n 1399.0
47 b NaN f 501.0
48 g 9.0 c 296.0
49 NaN 3.0 b 33.0
sort = False
validate = None
pandas/core/reshape/merge.py:184: in merge
return op.get_result(copy=copy)
copy = None
how = 'left'
indicator = False
left = 1st 2nd 3rd 4th
0 v 8.0 g 701.0
1 NaN 2.0 h 623.0
2 k NaN v 2110.0
3 l 2.0 NaN -9669.0
4 i 4.0 p 1548.0
5 NaN 8.0 s 1783.0
6 z 4.0 e 465.0
7 w NaN b 122.0
8 o 3.0 h 744.0
9 NaN 6.0 NaN -9737.0
10 h 8.0 o 1487.0
11 g 7.0 d 376.0
12 t NaN l 1119.0
13 NaN 1.0 r 1613.0
14 y 8.0 k 1104.0
15 f 0.0 NaN -9695.0
16 y 5.0 z 2574.0
17 NaN NaN r 1603.0
18 j 2.0 k 1029.0
19 b 6.0 e 461.0
20 i 3.0 i 838.0
21 NaN 5.0 NaN -9747.0
22 s NaN x 2318.0
23 w 1.0 u 2032.0
24 z 7.0 i 895.0
25 NaN 4.0 y 2343.0
26 f 6.0 m 1265.0
27 o NaN NaN -9686.0
28 s 9.0 c 308.0
29 NaN 4.0 c 143.0
30 y 2.0 f 544.0
31 l 6.0 w 2271.0
32 n NaN r 1713.0
33 NaN 9.0 NaN -9707.0
34 p 8.0 q 1695.0
35 l 6.0 k 1071.0
36 p 3.0 n 1345.0
37 NaN NaN p 1403.0
38 m 0.0 w 2212.0
39 f 1.0 NaN -9685.0
40 m 3.0 x 2342.0
41 NaN 3.0 p 1433.0
42 b NaN v 2101.0
43 l 5.0 m 1261.0
44 c 6.0 s 1862.0
45 NaN 8.0 NaN -9717.0
46 t 8.0 n 1399.0
47 b NaN f 501.0
48 g 9.0 c 296.0
49 NaN 3.0 b 33.0
left_df = 1st 2nd 3rd 4th
0 v 8.0 g 701.0
1 NaN 2.0 h 623.0
2 k NaN v 2110.0
3 l 2.0 NaN -9669.0
4 i 4.0 p 1548.0
5 NaN 8.0 s 1783.0
6 z 4.0 e 465.0
7 w NaN b 122.0
8 o 3.0 h 744.0
9 NaN 6.0 NaN -9737.0
10 h 8.0 o 1487.0
11 g 7.0 d 376.0
12 t NaN l 1119.0
13 NaN 1.0 r 1613.0
14 y 8.0 k 1104.0
15 f 0.0 NaN -9695.0
16 y 5.0 z 2574.0
17 NaN NaN r 1603.0
18 j 2.0 k 1029.0
19 b 6.0 e 461.0
20 i 3.0 i 838.0
21 NaN 5.0 NaN -9747.0
22 s NaN x 2318.0
23 w 1.0 u 2032.0
24 z 7.0 i 895.0
25 NaN 4.0 y 2343.0
26 f 6.0 m 1265.0
27 o NaN NaN -9686.0
28 s 9.0 c 308.0
29 NaN 4.0 c 143.0
30 y 2.0 f 544.0
31 l 6.0 w 2271.0
32 n NaN r 1713.0
33 NaN 9.0 NaN -9707.0
34 p 8.0 q 1695.0
35 l 6.0 k 1071.0
36 p 3.0 n 1345.0
37 NaN NaN p 1403.0
38 m 0.0 w 2212.0
39 f 1.0 NaN -9685.0
40 m 3.0 x 2342.0
41 NaN 3.0 p 1433.0
42 b NaN v 2101.0
43 l 5.0 m 1261.0
44 c 6.0 s 1862.0
45 NaN 8.0 NaN -9717.0
46 t 8.0 n 1399.0
47 b NaN f 501.0
48 g 9.0 c 296.0
49 NaN 3.0 b 33.0
left_index = False
left_on = ['1st', '2nd', '3rd']
on = None
op = <pandas.core.reshape.merge._MergeOperation object at 0xc9857b30>
right = 5th
1st 2nd 3rd
z 4.0 e -465.0
m 3.0 x -2342.0
nan 9.0 nan 9707.0
m 0.0 w -2212.0
w NaN b -122.0
t 8.0 n -1399.0
s 9.0 c -308.0
nan 8.0 nan 9717.0
s -1783.0
p 8.0 q -1695.0
t NaN l -1119.0
j 2.0 k -1029.0
o NaN nan 9686.0
l 2.0 nan 9669.0
nan 6.0 nan 9737.0
f 1.0 nan 9685.0
b NaN v -2101.0
w 1.0 u -2032.0
v 8.0 g -701.0
f 6.0 m -1265.0
i 4.0 p -1548.0
h 8.0 o -1487.0
y 8.0 k -1104.0
nan 3.0 p -1433.0
y 5.0 z -2574.0
l 5.0 m -1261.0
f 0.0 nan 9695.0
g 9.0 c -296.0
nan 1.0 r -1613.0
z 7.0 i -895.0
i 3.0 i -838.0
nan 4.0 y -2343.0
s NaN x -2318.0
nan 3.0 b -33.0
k NaN v -2110.0
g 7.0 d -376.0
n NaN r -1713.0
c 6.0 s -1862.0
b NaN f -501.0
nan NaN r -1603.0
b 6.0 e -461.0
nan NaN p -1403.0
5.0 nan 9747.0
4.0 c -143.0
l 6.0 w -2271.0
y 2.0 f -544.0
l 6.0 k -1071.0
p 3.0 n -1345.0
o 3.0 h -744.0
nan 2.0 h -623.0
right_df = 5th
1st 2nd 3rd
z 4.0 e -465.0
m 3.0 x -2342.0
nan 9.0 nan 9707.0
m 0.0 w -2212.0
w NaN b -122.0
t 8.0 n -1399.0
s 9.0 c -308.0
nan 8.0 nan 9717.0
s -1783.0
p 8.0 q -1695.0
t NaN l -1119.0
j 2.0 k -1029.0
o NaN nan 9686.0
l 2.0 nan 9669.0
nan 6.0 nan 9737.0
f 1.0 nan 9685.0
b NaN v -2101.0
w 1.0 u -2032.0
v 8.0 g -701.0
f 6.0 m -1265.0
i 4.0 p -1548.0
h 8.0 o -1487.0
y 8.0 k -1104.0
nan 3.0 p -1433.0
y 5.0 z -2574.0
l 5.0 m -1261.0
f 0.0 nan 9695.0
g 9.0 c -296.0
nan 1.0 r -1613.0
z 7.0 i -895.0
i 3.0 i -838.0
nan 4.0 y -2343.0
s NaN x -2318.0
nan 3.0 b -33.0
k NaN v -2110.0
g 7.0 d -376.0
n NaN r -1713.0
c 6.0 s -1862.0
b NaN f -501.0
nan NaN r -1603.0
b 6.0 e -461.0
nan NaN p -1403.0
5.0 nan 9747.0
4.0 c -143.0
l 6.0 w -2271.0
y 2.0 f -544.0
l 6.0 k -1071.0
p 3.0 n -1345.0
o 3.0 h -744.0
nan 2.0 h -623.0
right_index = True
right_on = None
sort = False
suffixes = ('', '')
validate = None
pandas/core/reshape/merge.py:886: in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
copy = None
self = <pandas.core.reshape.merge._MergeOperation object at 0xc9857b30>
pandas/core/reshape/merge.py:1142: in _get_join_info
join_index, left_indexer, right_indexer = _left_join_on_index(
left_ax = RangeIndex(start=0, stop=50, step=1)
right_ax = MultiIndex([('z', 4.0, 'e'),
('m', 3.0, 'x'),
(nan, 9.0, nan),
('m', 0.0, 'w'),
('w', nan, 'b'),
('t', 8.0, 'n'),
('s', 9.0, 'c'),
(nan, 8.0, nan),
(nan, 8.0, 's'),
('p', 8.0, 'q'),
('t', nan, 'l'),
('j', 2.0, 'k'),
('o', nan, nan),
('l', 2.0, nan),
(nan, 6.0, nan),
('f', 1.0, nan),
('b', nan, 'v'),
('w', 1.0, 'u'),
('v', 8.0, 'g'),
('f', 6.0, 'm'),
('i', 4.0, 'p'),
('h', 8.0, 'o'),
('y', 8.0, 'k'),
(nan, 3.0, 'p'),
('y', 5.0, 'z'),
('l', 5.0, 'm'),
('f', 0.0, nan),
('g', 9.0, 'c'),
(nan, 1.0, 'r'),
('z', 7.0, 'i'),
('i', 3.0, 'i'),
(nan, 4.0, 'y'),
('s', nan, 'x'),
(nan, 3.0, 'b'),
('k', nan, 'v'),
('g', 7.0, 'd'),
('n', nan, 'r'),
('c', 6.0, 's'),
('b', nan, 'f'),
(nan, nan, 'r'),
('b', 6.0, 'e'),
(nan, nan, 'p'),
(nan, 5.0, nan),
(nan, 4.0, 'c'),
('l', 6.0, 'w'),
('y', 2.0, 'f'),
('l', 6.0, 'k'),
('p', 3.0, 'n'),
('o', 3.0, 'h'),
(nan, 2.0, 'h')],
names=['1st', '2nd', '3rd'])
self = <pandas.core.reshape.merge._MergeOperation object at 0xc9857b30>
pandas/core/reshape/merge.py:2375: in _left_join_on_index
lkey, rkey = _get_multiindex_indexer(join_keys, right_ax, sort=sort)
join_keys = [<ArrowStringArrayNumpySemantics>
['v', nan, 'k', 'l', 'i', nan, 'z', 'w', 'o', nan, 'h', 'g', 't', nan, 'y',
'f', 'y', nan, 'j', 'b', 'i', nan, 's', 'w', 'z', nan, 'f', 'o', 's', nan,
'y', 'l', 'n', nan, 'p', 'l', 'p', nan, 'm', 'f', 'm', nan, 'b', 'l', 'c',
nan, 't', 'b', 'g', nan]
Length: 50, dtype: string,
array([ 8., 2., nan, 2., 4., 8., 4., nan, 3., 6., 8., 7., nan,
1., 8., 0., 5., nan, 2., 6., 3., 5., nan, 1., 7., 4.,
6., nan, 9., 4., 2., 6., nan, 9., 8., 6., 3., nan, 0.,
1., 3., 3., nan, 5., 6., 8., 8., nan, 9., 3.]),
<ArrowStringArrayNumpySemantics>
['g', 'h', 'v', nan, 'p', 's', 'e', 'b', 'h', nan, 'o', 'd', 'l', 'r', 'k',
nan, 'z', 'r', 'k', 'e', 'i', nan, 'x', 'u', 'i', 'y', 'm', nan, 'c', 'c',
'f', 'w', 'r', nan, 'q', 'k', 'n', 'p', 'w', nan, 'x', 'p', 'v', 'm', 's',
nan, 'n', 'f', 'c', 'b']
Length: 50, dtype: string]
left_ax = RangeIndex(start=0, stop=50, step=1)
right_ax = MultiIndex([('z', 4.0, 'e'),
('m', 3.0, 'x'),
(nan, 9.0, nan),
('m', 0.0, 'w'),
('w', nan, 'b'),
('t', 8.0, 'n'),
('s', 9.0, 'c'),
(nan, 8.0, nan),
(nan, 8.0, 's'),
('p', 8.0, 'q'),
('t', nan, 'l'),
('j', 2.0, 'k'),
('o', nan, nan),
('l', 2.0, nan),
(nan, 6.0, nan),
('f', 1.0, nan),
('b', nan, 'v'),
('w', 1.0, 'u'),
('v', 8.0, 'g'),
('f', 6.0, 'm'),
('i', 4.0, 'p'),
('h', 8.0, 'o'),
('y', 8.0, 'k'),
(nan, 3.0, 'p'),
('y', 5.0, 'z'),
('l', 5.0, 'm'),
('f', 0.0, nan),
('g', 9.0, 'c'),
(nan, 1.0, 'r'),
('z', 7.0, 'i'),
('i', 3.0, 'i'),
(nan, 4.0, 'y'),
('s', nan, 'x'),
(nan, 3.0, 'b'),
('k', nan, 'v'),
('g', 7.0, 'd'),
('n', nan, 'r'),
('c', 6.0, 's'),
('b', nan, 'f'),
(nan, nan, 'r'),
('b', 6.0, 'e'),
(nan, nan, 'p'),
(nan, 5.0, nan),
(nan, 4.0, 'c'),
('l', 6.0, 'w'),
('y', 2.0, 'f'),
('l', 6.0, 'k'),
('p', 3.0, 'n'),
('o', 3.0, 'h'),
(nan, 2.0, 'h')],
names=['1st', '2nd', '3rd'])
sort = False
pandas/core/reshape/merge.py:2309: in _get_multiindex_indexer
zipped = zip(*mapped)
index = MultiIndex([('z', 4.0, 'e'),
('m', 3.0, 'x'),
(nan, 9.0, nan),
('m', 0.0, 'w'),
('w', nan, 'b'),
('t', 8.0, 'n'),
('s', 9.0, 'c'),
(nan, 8.0, nan),
(nan, 8.0, 's'),
('p', 8.0, 'q'),
('t', nan, 'l'),
('j', 2.0, 'k'),
('o', nan, nan),
('l', 2.0, nan),
(nan, 6.0, nan),
('f', 1.0, nan),
('b', nan, 'v'),
('w', 1.0, 'u'),
('v', 8.0, 'g'),
('f', 6.0, 'm'),
('i', 4.0, 'p'),
('h', 8.0, 'o'),
('y', 8.0, 'k'),
(nan, 3.0, 'p'),
('y', 5.0, 'z'),
('l', 5.0, 'm'),
('f', 0.0, nan),
('g', 9.0, 'c'),
(nan, 1.0, 'r'),
('z', 7.0, 'i'),
('i', 3.0, 'i'),
(nan, 4.0, 'y'),
('s', nan, 'x'),
(nan, 3.0, 'b'),
('k', nan, 'v'),
('g', 7.0, 'd'),
('n', nan, 'r'),
('c', 6.0, 's'),
('b', nan, 'f'),
(nan, nan, 'r'),
('b', 6.0, 'e'),
(nan, nan, 'p'),
(nan, 5.0, nan),
(nan, 4.0, 'c'),
('l', 6.0, 'w'),
('y', 2.0, 'f'),
('l', 6.0, 'k'),
('p', 3.0, 'n'),
('o', 3.0, 'h'),
(nan, 2.0, 'h')],
names=['1st', '2nd', '3rd'])
join_keys = [<ArrowStringArrayNumpySemantics>
['v', nan, 'k', 'l', 'i', nan, 'z', 'w', 'o', nan, 'h', 'g', 't', nan, 'y',
'f', 'y', nan, 'j', 'b', 'i', nan, 's', 'w', 'z', nan, 'f', 'o', 's', nan,
'y', 'l', 'n', nan, 'p', 'l', 'p', nan, 'm', 'f', 'm', nan, 'b', 'l', 'c',
nan, 't', 'b', 'g', nan]
Length: 50, dtype: string,
array([ 8., 2., nan, 2., 4., 8., 4., nan, 3., 6., 8., 7., nan,
1., 8., 0., 5., nan, 2., 6., 3., 5., nan, 1., 7., 4.,
6., nan, 9., 4., 2., 6., nan, 9., 8., 6., 3., nan, 0.,
1., 3., 3., nan, 5., 6., 8., 8., nan, 9., 3.]),
<ArrowStringArrayNumpySemantics>
['g', 'h', 'v', nan, 'p', 's', 'e', 'b', 'h', nan, 'o', 'd', 'l', 'r', 'k',
nan, 'z', 'r', 'k', 'e', 'i', nan, 'x', 'u', 'i', 'y', 'm', nan, 'c', 'c',
'f', 'w', 'r', nan, 'q', 'k', 'n', 'p', 'w', nan, 'x', 'p', 'v', 'm', 's',
nan, 'n', 'f', 'c', 'b']
Length: 50, dtype: string]
mapped = <generator object _get_multiindex_indexer.<locals>.<genexpr> at 0xc9b7a710>
sort = False
pandas/core/reshape/merge.py:2306: in <genexpr>
_factorize_keys(index.levels[n]._values, join_keys[n], sort=sort)
.0 = <range_iterator object at 0xc9856710>
index = MultiIndex([('z', 4.0, 'e'),
('m', 3.0, 'x'),
(nan, 9.0, nan),
('m', 0.0, 'w'),
('w', nan, 'b'),
('t', 8.0, 'n'),
('s', 9.0, 'c'),
(nan, 8.0, nan),
(nan, 8.0, 's'),
('p', 8.0, 'q'),
('t', nan, 'l'),
('j', 2.0, 'k'),
('o', nan, nan),
('l', 2.0, nan),
(nan, 6.0, nan),
('f', 1.0, nan),
('b', nan, 'v'),
('w', 1.0, 'u'),
('v', 8.0, 'g'),
('f', 6.0, 'm'),
('i', 4.0, 'p'),
('h', 8.0, 'o'),
('y', 8.0, 'k'),
(nan, 3.0, 'p'),
('y', 5.0, 'z'),
('l', 5.0, 'm'),
('f', 0.0, nan),
('g', 9.0, 'c'),
(nan, 1.0, 'r'),
('z', 7.0, 'i'),
('i', 3.0, 'i'),
(nan, 4.0, 'y'),
('s', nan, 'x'),
(nan, 3.0, 'b'),
('k', nan, 'v'),
('g', 7.0, 'd'),
('n', nan, 'r'),
('c', 6.0, 's'),
('b', nan, 'f'),
(nan, nan, 'r'),
('b', 6.0, 'e'),
(nan, nan, 'p'),
(nan, 5.0, nan),
(nan, 4.0, 'c'),
('l', 6.0, 'w'),
('y', 2.0, 'f'),
('l', 6.0, 'k'),
('p', 3.0, 'n'),
('o', 3.0, 'h'),
(nan, 2.0, 'h')],
names=['1st', '2nd', '3rd'])
join_keys = [<ArrowStringArrayNumpySemantics>
['v', nan, 'k', 'l', 'i', nan, 'z', 'w', 'o', nan, 'h', 'g', 't', nan, 'y',
'f', 'y', nan, 'j', 'b', 'i', nan, 's', 'w', 'z', nan, 'f', 'o', 's', nan,
'y', 'l', 'n', nan, 'p', 'l', 'p', nan, 'm', 'f', 'm', nan, 'b', 'l', 'c',
nan, 't', 'b', 'g', nan]
Length: 50, dtype: string,
array([ 8., 2., nan, 2., 4., 8., 4., nan, 3., 6., 8., 7., nan,
1., 8., 0., 5., nan, 2., 6., 3., 5., nan, 1., 7., 4.,
6., nan, 9., 4., 2., 6., nan, 9., 8., 6., 3., nan, 0.,
1., 3., 3., nan, 5., 6., 8., 8., nan, 9., 3.]),
<ArrowStringArrayNumpySemantics>
['g', 'h', 'v', nan, 'p', 's', 'e', 'b', 'h', nan, 'o', 'd', 'l', 'r', 'k',
nan, 'z', 'r', 'k', 'e', 'i', nan, 'x', 'u', 'i', 'y', 'm', nan, 'c', 'c',
'f', 'w', 'r', nan, 'q', 'k', 'n', 'p', 'w', nan, 'x', 'p', 'v', 'm', 's',
nan, 'n', 'f', 'c', 'b']
Length: 50, dtype: string]
n = 0
sort = False
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
lk = <pyarrow.lib.ChunkedArray object at 0xc983bb18>
[
[
"b",
"c",
"f",
"g",
"h",
...
"t",
"v",
"w",
"y",
"z"
]
]
rk = <pyarrow.lib.ChunkedArray object at 0xc984c7f8>
[
[
"v",
null,
"k",
"l",
"i",
...
null,
"t",
"b",
"g",
null
]
]
sort = False
def _factorize_keys(
lk: ArrayLike, rk: ArrayLike, sort: bool = True
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
"""
Encode left and right keys as enumerated types.
This is used to get the join indexers to be used when merging DataFrames.
Parameters
----------
lk : ndarray, ExtensionArray
Left key.
rk : ndarray, ExtensionArray
Right key.
sort : bool, defaults to True
If True, the encoding is done such that the unique elements in the
keys are sorted.
Returns
-------
np.ndarray[np.intp]
Left (resp. right if called with `key='right'`) labels, as enumerated type.
np.ndarray[np.intp]
Right (resp. left if called with `key='right'`) labels, as enumerated type.
int
Number of unique elements in union of left and right labels.
See Also
--------
merge : Merge DataFrame or named Series objects
with a database-style join.
algorithms.factorize : Encode the object as an enumerated type
or categorical variable.
Examples
--------
>>> lk = np.array(["a", "c", "b"])
>>> rk = np.array(["a", "c"])
Here, the unique values are `'a', 'b', 'c'`. With the default
`sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:
>>> pd.core.reshape.merge._factorize_keys(lk, rk)
(array([0, 2, 1]), array([0, 2]), 3)
With the `sort=False`, the encoding will correspond to the order
in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:
>>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
(array([0, 1, 2]), array([0, 1]), 3)
"""
# TODO: if either is a RangeIndex, we can likely factorize more efficiently?
if (
isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
) or (lib.is_np_dtype(lk.dtype, "M") and lib.is_np_dtype(rk.dtype, "M")):
# Extract the ndarray (UTC-localized) values
# Note: we dont need the dtypes to match, as these can still be compared
lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
lk = cast("DatetimeArray", lk)._ndarray
rk = cast("DatetimeArray", rk)._ndarray
elif (
isinstance(lk.dtype, CategoricalDtype)
and isinstance(rk.dtype, CategoricalDtype)
and lk.dtype == rk.dtype
):
assert isinstance(lk, Categorical)
assert isinstance(rk, Categorical)
# Cast rk to encoding so we can compare codes with lk
rk = lk._encode_with_my_categories(rk)
lk = ensure_int64(lk.codes)
rk = ensure_int64(rk.codes)
elif isinstance(lk, ExtensionArray) and lk.dtype == rk.dtype:
if (isinstance(lk.dtype, ArrowDtype) and is_string_dtype(lk.dtype)) or (
isinstance(lk.dtype, StringDtype)
and lk.dtype.storage in ["pyarrow", "pyarrow_numpy"]
):
import pyarrow as pa
import pyarrow.compute as pc
len_lk = len(lk)
lk = lk._pa_array # type: ignore[attr-defined]
rk = rk._pa_array # type: ignore[union-attr]
dc = (
pa.chunked_array(lk.chunks + rk.chunks) # type: ignore[union-attr]
.combine_chunks()
.dictionary_encode()
)
llab, rlab, count = (
pc.fill_null(dc.indices[slice(len_lk)], -1)
.to_numpy()
.astype(np.intp, copy=False),
pc.fill_null(dc.indices[slice(len_lk, None)], -1)
.to_numpy()
.astype(np.intp, copy=False),
len(dc.dictionary),
)
if sort:
uniques = dc.dictionary.to_numpy(zero_copy_only=False)
llab, rlab = _sort_labels(uniques, llab, rlab)
if dc.null_count > 0:
lmask = llab == -1
lany = lmask.any()
rmask = rlab == -1
rany = rmask.any()
if lany:
np.putmask(llab, lmask, count)
if rany:
> np.putmask(rlab, rmask, count)
E ValueError: putmask: output array is read-only
count = 19
dc = <pyarrow.lib.DictionaryArray object at 0xc9083ed0>
-- dictionary:
[
"b",
"c",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"s",
"t",
"v",
"w",
"y",
"z"
]
-- indices:
[
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
...
9,
null,
0,
8,
1,
null,
14,
0,
3,
null
]
lany = False
len_lk = 19
lk = <pyarrow.lib.ChunkedArray object at 0xc983bb18>
[
[
"b",
"c",
"f",
"g",
"h",
...
"t",
"v",
"w",
"y",
"z"
]
]
llab = array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18])
lmask = array([False, False, False, False, False, False, False, False, False,
False, False, False, False, False, False, False, False, False,
False])
pa = <module 'pyarrow' from '/usr/lib/python3.11/site-packages/pyarrow/__init__.py'>
pc = <module 'pyarrow.compute' from '/usr/lib/python3.11/site-packages/pyarrow/compute.py'>
rany = True
rk = <pyarrow.lib.ChunkedArray object at 0xc984c7f8>
[
[
"v",
null,
"k",
"l",
"i",
...
null,
"t",
"b",
"g",
null
]
]
rlab = array([15, -1, 7, 8, 5, -1, 18, 16, 11, -1, 4, 3, 14, -1, 17, 2, 17,
-1, 6, 0, 5, -1, 13, 16, 18, -1, 2, 11, 13, -1, 17, 8, 10, -1,
12, 8, 12, -1, 9, 2, 9, -1, 0, 8, 1, -1, 14, 0, 3, -1])
rmask = array([False, True, False, False, False, True, False, False, False,
True, False, False, False, True, False, False, False, True,
False, False, False, True, False, False, False, True, False,
False, False, True, False, False, False, True, False, False,
False, True, False, False, False, True, False, False, False,
True, False, False, False, True])
sort = False
pandas/core/reshape/merge.py:2514: ValueError
Full build & test log (2.5M .gz, 52M uncompressed): pandas.txt.gz
This is on Gentoo/x86 systemd-nspawn container. I'm using -O2 -march=pentium-m -mfpmath=sse -pipe
flags to rule out i387-specific precision issues.
I've also filed apache/arrow#40153 for test failures in pyarrow itself. Some of them could possibly be bugs in pandas instead.
Expected Behavior
Tests passing ;-).
Installed Versions
INSTALLED VERSIONS
commit : fd3f571
python : 3.11.7.final.0
python-bits : 32
OS : Linux
OS-release : 6.7.5-gentoo-dist
Version : #1 SMP PREEMPT_DYNAMIC Sat Feb 17 07:30:27 -00 2024
machine : x86_64
processor : AMD Ryzen 5 3600 6-Core Processor
byteorder : little
LC_ALL : None
LANG : C.UTF8
LOCALE : en_US.UTF-8
pandas : 2.2.0
numpy : 1.26.4
pytz : 2024.1
dateutil : 2.8.2
setuptools : 69.0.3
pip : None
Cython : 3.0.5
pytest : 7.4.4
hypothesis : 6.98.3
sphinx : None
blosc : None
feather : None
xlsxwriter : 3.2.0
lxml.etree : 4.9.4
html5lib : 1.1
pymysql : 1.4.6
psycopg2 : None
jinja2 : 3.1.3
IPython : None
pandas_datareader : None
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : 4.12.3
bottleneck : 1.3.7
dataframe-api-compat : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : 3.8.3
numba : None
numexpr : 2.9.0
odfpy : None
openpyxl : 3.1.2
pandas_gbq : None
pyarrow : 15.0.0
pyreadstat : None
python-calamine : None
pyxlsb : None
s3fs : None
scipy : None
sqlalchemy : 2.0.27
tables : 3.9.2
tabulate : 0.9.0
xarray : 2024.2.0
xlrd : 2.0.1
zstandard : None
tzdata : None
qtpy : None
pyqt5 : None