Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions python/benchmarks/bench_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,68 @@ def peakmem_int_to_pandas(self, n_rows, types_mapper):

def peakmem_int_with_nulls_to_pandas(self, n_rows, types_mapper):
self.int_array_with_nulls.to_pandas(types_mapper=self.types_mapper)


class LongArrowToPandasBenchmark:
    """Benchmark for Arrow long array -> Pandas conversions."""

    # asv parameterization: array length x conversion strategy.
    params = [
        [10000, 100000, 1000000],
        ["simple", "arrow_types_mapper", "pd.Series"],
    ]
    param_names = ["n_rows", "method"]

    def setup(self, n_rows, method):
        # Append a value near the int64 upper bound so a lossy intermediate
        # cast (e.g. through float64) would be observable.
        values = list(range(n_rows - 1))
        values.append(9223372036854775707)
        self.long_array = pa.array(values, type=pa.int64())

    # Exercise three alternative conversions of non-nullable longs to numpy int64.
    def run_long_to_pandas(self, n_rows, method):
        if method == "simple":
            result = self.long_array.to_pandas()
        elif method == "arrow_types_mapper":
            converted = self.long_array.to_pandas(types_mapper=pd.ArrowDtype)
            result = converted.astype(np.int64)
        else:
            result = pd.Series(self.long_array, dtype=np.int64)
        # Sanity check: every strategy must land on plain numpy int64.
        assert result.dtype == np.int64

    def time_long_to_pandas(self, n_rows, method):
        self.run_long_to_pandas(n_rows, method)

    def peakmem_long_to_pandas(self, n_rows, method):
        self.run_long_to_pandas(n_rows, method)


class NullableLongArrowToPandasBenchmark:
    """Benchmark for Arrow long array with nulls -> Pandas conversions."""

    # asv parameterization: array length x conversion strategy.
    params = [
        [10000, 100000, 1000000],
        ["integer_object_nulls", "arrow_types_mapper", "pd.Series"],
    ]
    param_names = ["n_rows", "method"]

    def setup(self, n_rows, method):
        # Every 10th slot is null; a near-int64-max sentinel is appended so a
        # lossy intermediate cast (e.g. through float64) would be observable.
        values = [None if i % 10 == 0 else i for i in range(n_rows - 1)]
        values.append(9223372036854775707)
        self.long_array_with_nulls = pa.array(values, type=pa.int64())

    # Exercise three alternative conversions of nullable longs to the
    # nullable pandas extension dtype.
    def run_long_with_nulls_to_pandas_ext(self, n_rows, method):
        if method == "integer_object_nulls":
            as_objects = self.long_array_with_nulls.to_pandas(integer_object_nulls=True)
            result = as_objects.astype(pd.Int64Dtype())
        elif method == "arrow_types_mapper":
            as_arrow = self.long_array_with_nulls.to_pandas(types_mapper=pd.ArrowDtype)
            result = as_arrow.astype(pd.Int64Dtype())
        else:
            result = pd.Series(self.long_array_with_nulls.to_pylist(), dtype=pd.Int64Dtype())
        # Sanity check: every strategy must land on the nullable Int64 dtype.
        assert result.dtype == pd.Int64Dtype()

    def time_long_with_nulls_to_pandas_ext(self, n_rows, method):
        self.run_long_with_nulls_to_pandas_ext(n_rows, method)

    def peakmem_long_with_nulls_to_pandas_ext(self, n_rows, method):
        self.run_long_with_nulls_to_pandas_ext(n_rows, method)
28 changes: 1 addition & 27 deletions python/pyspark/sql/conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -1654,33 +1654,7 @@ def convert_numpy(

series: pd.Series

# TODO(SPARK-55332): Create benchmark for pa.array -> pd.series integer conversion
# 1, benchmark a nullable integral array
# a = pa.array(list(range(10000000)) + [9223372036854775707, None], type=pa.int64())
# %timeit a.to_pandas(types_mapper=pd.ArrowDtype)
# 11.9 μs ± 407 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
# %timeit a.to_pandas(types_mapper=pd.ArrowDtype).astype(pd.Int64Dtype())
# 589 ms ± 9.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# %timeit pd.Series(a.to_pylist(), dtype=pd.Int64Dtype())
# 2.94 s ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# %timeit a.to_pandas(integer_object_nulls=True).astype(pd.Int64Dtype())
# 2.05 s ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# pd.Series(a, dtype=pd.Int64Dtype())
# fails due to internal np.float64 coercion
# OverflowError: Python int too large to convert to C long
#
# 2, benchmark a nullable integral array
# b = pa.array(list(range(10000000)) + [9223372036854775707, 1], type=pa.int64())
# %timeit b.to_pandas(types_mapper=pd.ArrowDtype).astype(np.int64)
# 30.2 μs ± 831 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
# %timeit pd.Series(b.to_pandas(types_mapper=pd.ArrowDtype), dtype=np.int64)
# 33.3 μs ± 928 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
# %timeit pd.Series(b, dtype=np.int64) <- lose the name
# 11.9 μs ± 125 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
# %timeit b.to_pandas()
# 7.56 μs ± 96.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
# %timeit b.to_pandas().astype(np.int64) <- astype is non-trivial
# 19.1 μs ± 242 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
    # Conversion methods were selected based on the benchmarks in
    # python/benchmarks/bench_arrow.py.
if isinstance(spark_type, ByteType):
if arr.null_count > 0:
series = arr.to_pandas(types_mapper=pd.ArrowDtype).astype(pd.Int8Dtype())
Expand Down