Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions python/benchmarks/bench_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,68 @@ def peakmem_int_to_pandas(self, n_rows, types_mapper):

def peakmem_int_with_nulls_to_pandas(self, n_rows, types_mapper):
self.int_array_with_nulls.to_pandas(types_mapper=self.types_mapper)


class LongArrowToPandasBenchmark:
    """Benchmark for Arrow long array -> Pandas conversions."""

    # asv parameterization: array length x conversion strategy.
    params = [
        [10000, 100000, 1000000],
        ["simple", "arrow_types_mapper", "pd.Series"],
    ]
    param_names = ["n_rows", "method"]

    def setup(self, n_rows, method):
        # Append a value near the int64 upper bound so a lossy intermediate
        # cast (e.g. through float64) would be observable.
        values = list(range(n_rows - 1))
        values.append(9223372036854775707)
        self.long_array = pa.array(values, type=pa.int64())

    # Exercise three alternative conversions of non-nullable longs to numpy int64.
    def run_long_to_pandas(self, n_rows, method):
        if method == "simple":
            result = self.long_array.to_pandas()
        elif method == "arrow_types_mapper":
            converted = self.long_array.to_pandas(types_mapper=pd.ArrowDtype)
            result = converted.astype(np.int64)
        else:
            result = pd.Series(self.long_array, dtype=np.int64)
        # Sanity check: every strategy must land on plain numpy int64.
        assert result.dtype == np.int64

    def time_long_to_pandas(self, n_rows, method):
        self.run_long_to_pandas(n_rows, method)

    def peakmem_long_to_pandas(self, n_rows, method):
        self.run_long_to_pandas(n_rows, method)


class NullableLongArrowToPandasBenchmark:
    """Benchmark for Arrow long array with nulls -> Pandas conversions."""

    # asv parameterization: array length x conversion strategy.
    params = [
        [10000, 100000, 1000000],
        ["integer_object_nulls", "arrow_types_mapper", "pd.Series"],
    ]
    param_names = ["n_rows", "method"]

    def setup(self, n_rows, method):
        # Every 10th slot is null; a near-int64-max sentinel is appended so a
        # lossy intermediate cast (e.g. through float64) would be observable.
        values = [None if i % 10 == 0 else i for i in range(n_rows - 1)]
        values.append(9223372036854775707)
        self.long_array_with_nulls = pa.array(values, type=pa.int64())

    # Exercise three alternative conversions of nullable longs to the
    # nullable pandas extension dtype.
    def run_long_with_nulls_to_pandas_ext(self, n_rows, method):
        if method == "integer_object_nulls":
            as_objects = self.long_array_with_nulls.to_pandas(integer_object_nulls=True)
            result = as_objects.astype(pd.Int64Dtype())
        elif method == "arrow_types_mapper":
            as_arrow = self.long_array_with_nulls.to_pandas(types_mapper=pd.ArrowDtype)
            result = as_arrow.astype(pd.Int64Dtype())
        else:
            result = pd.Series(self.long_array_with_nulls.to_pylist(), dtype=pd.Int64Dtype())
        # Sanity check: every strategy must land on the nullable Int64 dtype.
        assert result.dtype == pd.Int64Dtype()

    def time_long_with_nulls_to_pandas_ext(self, n_rows, method):
        self.run_long_with_nulls_to_pandas_ext(n_rows, method)

    def peakmem_long_with_nulls_to_pandas_ext(self, n_rows, method):
        self.run_long_with_nulls_to_pandas_ext(n_rows, method)
28 changes: 1 addition & 27 deletions python/pyspark/sql/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1654,33 +1654,7 @@ def convert_numpy(

series: pd.Series

# TODO(SPARK-55332): Create benchmark for pa.array -> pd.series integer conversion
# 1, benchmark a nullable integral array
# a = pa.array(list(range(10000000)) + [9223372036854775707, None], type=pa.int64())
# %timeit a.to_pandas(types_mapper=pd.ArrowDtype)
# 11.9 μs ± 407 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
# %timeit a.to_pandas(types_mapper=pd.ArrowDtype).astype(pd.Int64Dtype())
# 589 ms ± 9.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# %timeit pd.Series(a.to_pylist(), dtype=pd.Int64Dtype())
# 2.94 s ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# %timeit a.to_pandas(integer_object_nulls=True).astype(pd.Int64Dtype())
# 2.05 s ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# pd.Series(a, dtype=pd.Int64Dtype())
# fails due to internal np.float64 coercion
# OverflowError: Python int too large to convert to C long
#
# 2, benchmark a nullable integral array
# b = pa.array(list(range(10000000)) + [9223372036854775707, 1], type=pa.int64())
# %timeit b.to_pandas(types_mapper=pd.ArrowDtype).astype(np.int64)
# 30.2 μs ± 831 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
# %timeit pd.Series(b.to_pandas(types_mapper=pd.ArrowDtype), dtype=np.int64)
# 33.3 μs ± 928 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
# %timeit pd.Series(b, dtype=np.int64) <- lose the name
# 11.9 μs ± 125 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
# %timeit b.to_pandas()
# 7.56 μs ± 96.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
# %timeit b.to_pandas().astype(np.int64) <- astype is non-trivial
# 19.1 μs ± 242 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
    # Conversion methods were selected based on the benchmarks in
    # python/benchmarks/bench_arrow.py.
if isinstance(spark_type, ByteType):
if arr.null_count > 0:
series = arr.to_pandas(types_mapper=pd.ArrowDtype).astype(pd.Int8Dtype())
Expand Down