Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert result of group by agg to pyarrow if input is pyarrow #58129

Closed
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
9faa460
Set preserve_dtype flag for bool type only when result is also bool
Apr 1, 2024
969d5b1
Update implementation to change type to pyarrow only
Apr 2, 2024
66114f3
Change import order
Apr 2, 2024
b0290ed
Convert numpy array to pandas representation of pyarrow array
Apr 3, 2024
20c8fa0
Add tests
Apr 3, 2024
97b3d54
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
Apr 3, 2024
932d737
Change pyarrow to optional import in agg_series() method
Apr 5, 2024
82ddeb5
Separate tests
Apr 5, 2024
d510052
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
Apr 5, 2024
62a31d9
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
Apr 8, 2024
a54bf58
Revert to old implementation
Apr 8, 2024
64330f0
Update implementation to use pyarrow array method
Apr 8, 2024
0647711
Update test_aggregate tests
Apr 8, 2024
affde38
Move pyarrow import to top of method
Apr 8, 2024
842f561
Update according to pr comments
Apr 12, 2024
93b5bf3
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
Apr 20, 2024
6f35c0e
Fallback convert to input dtype if output is all nan or empty array
Apr 20, 2024
abd0adf
Strip na values when inferring pyarrow dtype
Apr 20, 2024
bebc442
Update tests to check expected inferred dtype instead of input dtype
Apr 20, 2024
bb6343b
Override test case for test_arrow.py
Apr 21, 2024
3a3f2a2
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
Apr 21, 2024
6dc40f5
Empty commit to trigger build run
Apr 21, 2024
4ef96f7
In agg series, convert to np values, then cast to pyarrow dtype, acco…
Apr 23, 2024
c6a98c0
Update tests
Apr 23, 2024
9181eaf
Update rst docs
Apr 25, 2024
612d7d0
Update impl to fix tests
Apr 25, 2024
3b6696b
Declare variable in outer scope
Apr 25, 2024
680e238
Update impl to use maybe_cast_pointwise_result instead of maybe_cast…
Apr 29, 2024
3a8597e
Fix tests with nested array
Apr 29, 2024
6496b15
Update according to pr comments
May 2, 2024
712c36a
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
May 2, 2024
e1ccef6
Preserve_dtype if argument is passed in, else don't preserve
May 7, 2024
0ce083d
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
undermyumbrella1 May 7, 2024
a1d73f5
Update tests
May 7, 2024
57845a8
Merge branch 'fix/group_by_agg_pyarrow_bool_numpy_same_type' of githu…
May 7, 2024
fa257b0
Remove redundant tests
undermyumbrella1 May 12, 2024
0a9b83f
Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type
undermyumbrella1 May 12, 2024
139319a
retrigger pipeline
undermyumbrella1 May 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Update tests
  • Loading branch information
Kei committed Apr 23, 2024
commit c6a98c0e0f4a4cd130aca2cb90f105f8bbf4135e
40 changes: 19 additions & 21 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1886,46 +1886,48 @@ def test_agg_lambda_pyarrow_to_same_data_type():
def test_agg_lambda_float64_pyarrow_dtype_conversion():
# test numpy dtype conversion back to pyarrow dtype
# complexes, floats, ints, uints, object
df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]})
df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100.0, 200, 255.3873]})
df["B"] = df["B"].astype("float64[pyarrow]")
gb = df.groupby("A")
result = gb.agg(lambda x: x)

expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]})
expected["B"] = expected["B"].astype("float64[pyarrow]")
expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100.0, 200, 255.3873]})
expected["B"] = expected["B"].astype("double[pyarrow]")
expected.set_index("A", inplace=True)

tm.assert_frame_equal(result, expected)
assert result["B"].dtype == expected["B"].dtype


def test_agg_lambda_complex128_pyarrow_dtype_conversion():
def test_agg_lambda_int64_pyarrow_dtype_conversion():
# test numpy dtype conversion back to pyarrow dtype
# complexes, floats, ints, uints, object
df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]})
df["B"] = df["B"].astype("int64[pyarrow]")
gb = df.groupby("A")
result = gb.agg(lambda x: complex(x.sum(), x.count()))
result = gb.agg(lambda x: x)

expected = DataFrame(
{
"A": ["c1", "c2", "c3"],
"B": [complex(100, 1), complex(200, 1), complex(255, 1)],
}
)
expected["B"] = expected["B"].astype("complex128")
expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]})
expected["B"] = expected["B"].astype("int64[pyarrow]")
expected.set_index("A", inplace=True)

tm.assert_frame_equal(result, expected)
assert result["B"].dtype == expected["B"].dtype


def test_agg_lambda_int64_pyarrow_dtype_conversion():
def test_agg_lambda_complex128_pyarrow_dtype_conversion():
df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]})
df["B"] = df["B"].astype("int64[pyarrow]")
gb = df.groupby("A")
result = gb.agg(lambda x: x)
result = gb.agg(lambda x: complex(x.sum(), x.count()))

expected = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]})
expected["B"] = expected["B"].astype("int64[pyarrow]")
expected = DataFrame(
{
"A": ["c1", "c2", "c3"],
"B": [complex(100, 1), complex(200, 1), complex(255, 1)],
}
)
expected["B"] = expected["B"].astype("complex128")
expected.set_index("A", inplace=True)

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1975,8 +1977,6 @@ def test_agg_lambda_bool_pyarrow_dtype_conversion():


def test_agg_lambda_object_pyarrow_dtype_conversion():
import pyarrow as pa

df = DataFrame({"A": ["c1", "c2", "c3"], "B": [100, 200, 255]})
df["B"] = df["B"].astype("int64[pyarrow]")
gb = df.groupby("A")
Expand All @@ -1986,9 +1986,7 @@ def test_agg_lambda_object_pyarrow_dtype_conversion():
{"A": ["c1", "c2", "c3"], "B": [{"number": 1}, {"number": 1}, {"number": 1}]}
)

pyarrow_type = pa.struct({"number": pa.int64()})
pandas_pyarrow_dtype = pd.ArrowDtype(pyarrow_type)
expected["B"] = expected["B"].astype(pandas_pyarrow_dtype)
expected["B"] = expected["B"].astype("object")
expected.set_index("A", inplace=True)

tm.assert_frame_equal(result, expected)
Expand Down
Loading