
Commit 6adb0e1

Merge branch 'main' into main_chelsealin_enablesqlglot2
2 parents f900cdb + d02d32f

File tree

13 files changed: 147 additions, 92 deletions


bigframes/core/blocks.py

Lines changed: 23 additions & 42 deletions
@@ -818,49 +818,30 @@ def _materialize_local(
             total_rows = result_batches.approx_total_rows
             # Remove downsampling config from subsequent invocations, as otherwise could result in many
             # iterations if downsampling undershoots
-            return self._downsample(
-                total_rows=total_rows,
-                sampling_method=sample_config.sampling_method,
-                fraction=fraction,
-                random_state=sample_config.random_state,
-            )._materialize_local(
-                MaterializationOptions(ordered=materialize_options.ordered)
-            )
-        else:
-            df = result_batches.to_pandas()
-            df = self._copy_index_to_pandas(df)
-            df.set_axis(self.column_labels, axis=1, copy=False)
-            return df, execute_result.query_job
-
-    def _downsample(
-        self, total_rows: int, sampling_method: str, fraction: float, random_state
-    ) -> Block:
-        # either selecting fraction or number of rows
-        if sampling_method == _HEAD:
-            filtered_block = self.slice(stop=int(total_rows * fraction))
-            return filtered_block
-        elif (sampling_method == _UNIFORM) and (random_state is None):
-            filtered_expr = self.expr._uniform_sampling(fraction)
-            block = Block(
-                filtered_expr,
-                index_columns=self.index_columns,
-                column_labels=self.column_labels,
-                index_labels=self.index.names,
-            )
-            return block
-        elif sampling_method == _UNIFORM:
-            block = self.split(
-                fracs=(fraction,),
-                random_state=random_state,
-                sort=False,
-            )[0]
-            return block
+            if sample_config.sampling_method == "head":
+                # Just truncates the result iterator without a follow-up query
+                raw_df = result_batches.to_pandas(limit=int(total_rows * fraction))
+            elif (
+                sample_config.sampling_method == "uniform"
+                and sample_config.random_state is None
+            ):
+                # Pushes sample into result without new query
+                sampled_batches = execute_result.batches(sample_rate=fraction)
+                raw_df = sampled_batches.to_pandas()
+            else:  # uniform sample with random state requires a full follow-up query
+                down_sampled_block = self.split(
+                    fracs=(fraction,),
+                    random_state=sample_config.random_state,
+                    sort=False,
+                )[0]
+                return down_sampled_block._materialize_local(
+                    MaterializationOptions(ordered=materialize_options.ordered)
+                )
         else:
-            # This part should never be called, just in case.
-            raise NotImplementedError(
-                f"The downsampling method {sampling_method} is not implemented, "
-                f"please choose from {','.join(_SAMPLING_METHODS)}."
-            )
+            raw_df = result_batches.to_pandas()
+        df = self._copy_index_to_pandas(raw_df)
+        df.set_axis(self.column_labels, axis=1, copy=False)
+        return df, execute_result.query_job

     def split(
         self,

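The rewritten `_materialize_local` inlines the old `_downsample` helper and orders the strategies by cost: `head` just truncates the batch iterator, unseeded `uniform` pushes a `RAND()` predicate into the read already in flight, and seeded `uniform` still pays for a follow-up `split` query. A minimal sketch of that dispatch, with duck-typed stand-ins for the real `Block` internals:

from typing import Optional

def materialize_sample(
    result_batches,    # ResultsIterator-like: .to_pandas(limit=...)
    execute_result,    # ExecuteResult-like: .batches(sample_rate=...)
    block,             # Block-like: .split(fracs=..., random_state=..., sort=...)
    method: str,
    fraction: float,
    total_rows: int,
    random_state: Optional[int],
):
    if method == "head":
        # Cheapest path: stop pulling batches once enough rows have arrived.
        return result_batches.to_pandas(limit=int(total_rows * fraction))
    if method == "uniform" and random_state is None:
        # Push a RAND() < fraction row restriction into the existing read.
        return execute_result.batches(sample_rate=fraction).to_pandas()
    # Reproducible sampling needs a hash-based split, i.e. a follow-up query;
    # the caller re-materializes the returned block.
    return block.split(fracs=(fraction,), random_state=random_state, sort=False)[0]
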
bigframes/core/bq_data.py

Lines changed: 12 additions & 1 deletion
@@ -186,11 +186,22 @@ def get_arrow_batches(
     columns: Sequence[str],
     storage_read_client: bigquery_storage_v1.BigQueryReadClient,
     project_id: str,
+    sample_rate: Optional[float] = None,
 ) -> ReadResult:
     table_mod_options = {}
     read_options_dict: dict[str, Any] = {"selected_fields": list(columns)}
+
+    predicates = []
     if data.sql_predicate:
-        read_options_dict["row_restriction"] = data.sql_predicate
+        predicates.append(data.sql_predicate)
+    if sample_rate is not None:
+        assert isinstance(sample_rate, float)
+        predicates.append(f"RAND() < {sample_rate}")
+
+    if predicates:
+        full_predicates = " AND ".join(f"( {pred} )" for pred in predicates)
+        read_options_dict["row_restriction"] = full_predicates
+
     read_options = bq_storage_types.ReadSession.TableReadOptions(**read_options_dict)

     if data.at_time:

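The `row_restriction` change composes the user's predicate with the sampling predicate, parenthesizing each so that `AND` cannot bind into an inner `OR`. A standalone sketch of the same merging (`build_row_restriction` is a hypothetical helper, not part of the module):

from typing import Optional

def build_row_restriction(
    sql_predicate: Optional[str], sample_rate: Optional[float]
) -> Optional[str]:
    predicates = []
    if sql_predicate:
        predicates.append(sql_predicate)
    if sample_rate is not None:
        predicates.append(f"RAND() < {sample_rate}")
    if not predicates:
        return None
    # Parenthesize each predicate before joining with AND.
    return " AND ".join(f"( {pred} )" for pred in predicates)

assert (
    build_row_restriction("a = 1 OR b = 2", 0.25)
    == "( a = 1 OR b = 2 ) AND ( RAND() < 0.25 )"
)
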
bigframes/core/compile/sqlglot/expressions/numeric_ops.py

Lines changed: 57 additions & 21 deletions
@@ -93,12 +93,19 @@ def _(expr: TypedExpr) -> sge.Expression:
 def _(expr: TypedExpr) -> sge.Expression:
     return sge.Case(
         ifs=[
+            # |x| < 1: The standard formula
+            sge.If(
+                this=sge.func("ABS", expr.expr) < sge.convert(1),
+                true=sge.func("ATANH", expr.expr),
+            ),
+            # |x| > 1: Returns NaN
             sge.If(
                 this=sge.func("ABS", expr.expr) > sge.convert(1),
                 true=constants._NAN,
-            )
+            ),
         ],
-        default=sge.func("ATANH", expr.expr),
+        # |x| = 1: Returns Infinity or -Infinity
+        default=sge.Mul(this=constants._INF, expression=expr.expr),
     )
 
 
@@ -145,15 +152,11 @@ def _(expr: TypedExpr) -> sge.Expression:
 
 @register_unary_op(ops.expm1_op)
 def _(expr: TypedExpr) -> sge.Expression:
-    return sge.Case(
-        ifs=[
-            sge.If(
-                this=expr.expr > constants._FLOAT64_EXP_BOUND,
-                true=constants._INF,
-            )
-        ],
-        default=sge.func("EXP", expr.expr),
-    ) - sge.convert(1)
+    return sge.If(
+        this=expr.expr > constants._FLOAT64_EXP_BOUND,
+        true=constants._INF,
+        false=sge.func("EXP", expr.expr) - sge.convert(1),
+    )
 
 
 @register_unary_op(ops.floor_op)
@@ -166,11 +169,22 @@ def _(expr: TypedExpr) -> sge.Expression:
     return sge.Case(
         ifs=[
             sge.If(
-                this=expr.expr <= sge.convert(0),
+                this=sge.Is(this=expr.expr, expression=sge.Null()),
+                true=sge.null(),
+            ),
+            # |x| > 0: The standard formula
+            sge.If(
+                this=expr.expr > sge.convert(0),
+                true=sge.Ln(this=expr.expr),
+            ),
+            # |x| < 0: Returns NaN
+            sge.If(
+                this=expr.expr < sge.convert(0),
                 true=constants._NAN,
-            )
+            ),
         ],
-        default=sge.Ln(this=expr.expr),
+        # |x| == 0: Returns -Infinity
+        default=constants._NEG_INF,
     )
 
 
@@ -179,11 +193,22 @@ def _(expr: TypedExpr) -> sge.Expression:
     return sge.Case(
         ifs=[
             sge.If(
-                this=expr.expr <= sge.convert(0),
+                this=sge.Is(this=expr.expr, expression=sge.Null()),
+                true=sge.null(),
+            ),
+            # |x| > 0: The standard formula
+            sge.If(
+                this=expr.expr > sge.convert(0),
+                true=sge.Log(this=sge.convert(10), expression=expr.expr),
+            ),
+            # |x| < 0: Returns NaN
+            sge.If(
+                this=expr.expr < sge.convert(0),
                 true=constants._NAN,
-            )
+            ),
         ],
-        default=sge.Log(this=expr.expr, expression=sge.convert(10)),
+        # |x| == 0: Returns -Infinity
+        default=constants._NEG_INF,
     )
 
 
@@ -192,11 +217,22 @@ def _(expr: TypedExpr) -> sge.Expression:
     return sge.Case(
         ifs=[
             sge.If(
-                this=expr.expr <= sge.convert(-1),
+                this=sge.Is(this=expr.expr, expression=sge.Null()),
+                true=sge.null(),
+            ),
+            # Domain: |x| > -1 (The standard formula)
+            sge.If(
+                this=expr.expr > sge.convert(-1),
+                true=sge.Ln(this=sge.convert(1) + expr.expr),
+            ),
+            # Out of Domain: |x| < -1 (Returns NaN)
+            sge.If(
+                this=expr.expr < sge.convert(-1),
                 true=constants._NAN,
-            )
+            ),
         ],
-        default=sge.Ln(this=sge.convert(1) + expr.expr),
+        # Boundary: |x| == -1 (Returns -Infinity)
+        default=constants._NEG_INF,
     )
 
 
@@ -608,7 +644,7 @@ def isfinite(arg: TypedExpr) -> sge.Expression:
     return sge.Not(
         this=sge.Or(
            this=sge.IsInf(this=arg.expr),
-            right=sge.IsNan(this=arg.expr),
+            expression=sge.IsNan(this=arg.expr),
         ),
    )
 

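The log-family ops (`ln`, `log10`, `log1p`) now share a four-branch shape: NULL passthrough, the in-domain formula, NaN out of domain, and an explicit boundary default. A freestanding sketch of the `ln` variant using sqlglot's public builders (`_NAN` and `_NEG_INF` stand in for the module's `constants`):

import sqlglot.expressions as sge

_NAN = sge.Cast(this=sge.convert("NaN"), to=sge.DataType.build("FLOAT64", dialect="bigquery"))
_NEG_INF = sge.Cast(this=sge.convert("-Infinity"), to=sge.DataType.build("FLOAT64", dialect="bigquery"))

col = sge.column("float64_col")
ln_case = sge.Case(
    ifs=[
        # NULL in, NULL out (CASE would otherwise fall through to the default).
        sge.If(this=sge.Is(this=col, expression=sge.Null()), true=sge.null()),
        sge.If(this=col > sge.convert(0), true=sge.Ln(this=col)),  # x > 0
        sge.If(this=col < sge.convert(0), true=_NAN),              # x < 0
    ],
    default=_NEG_INF,                                              # x == 0
)
print(ln_case.sql(dialect="bigquery"))
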
bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 1 addition & 1 deletion
@@ -674,7 +674,7 @@ def _literal(value: typing.Any, dtype: dtypes.Dtype) -> sge.Expression:
             expressions=[_literal(value=v, dtype=value_type) for v in value]
         )
         return values if len(value) > 0 else _cast(values, sqlglot_type)
-    elif pd.isna(value):
+    elif pd.isna(value) or (isinstance(value, pa.Scalar) and not value.is_valid):
         return _cast(sge.Null(), sqlglot_type)
     elif dtype == dtypes.JSON_DTYPE:
         return sge.ParseJSON(this=sge.convert(str(value)))

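The `_literal` tweak matters because an Arrow null arrives as a `pa.Scalar` whose `is_valid` is False, and `pd.isna` does not treat such a scalar object as missing. A quick repro of the gap, assuming current pandas/pyarrow behavior:

import pandas as pd
import pyarrow as pa

null_scalar = pa.scalar(None, type=pa.int64())
print(null_scalar.is_valid)  # False: an Arrow-level NULL
print(pd.isna(null_scalar))  # False: pd.isna does not unwrap pa.Scalar objects
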
bigframes/core/local_data.py

Lines changed: 10 additions & 1 deletion
@@ -25,6 +25,7 @@
 import uuid
 
 import geopandas  # type: ignore
+import numpy
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -124,13 +125,21 @@ def to_arrow(
         geo_format: Literal["wkb", "wkt"] = "wkt",
         duration_type: Literal["int", "duration"] = "duration",
         json_type: Literal["string"] = "string",
+        sample_rate: Optional[float] = None,
         max_chunksize: Optional[int] = None,
     ) -> tuple[pa.Schema, Iterable[pa.RecordBatch]]:
         if geo_format != "wkt":
             raise NotImplementedError(f"geo format {geo_format} not yet implemented")
         assert json_type == "string"
 
-        batches = self.data.to_batches(max_chunksize=max_chunksize)
+        data = self.data
+
+        # This exists for symmetry with remote sources, but sampling local data like this shouldn't really happen
+        if sample_rate is not None:
+            to_take = numpy.random.rand(data.num_rows) < sample_rate
+            data = data.filter(to_take)
+
+        batches = data.to_batches(max_chunksize=max_chunksize)
         schema = self.data.schema
         if duration_type == "int":
             schema = _schema_durations_to_ints(schema)

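For the local path there is no query engine to push a predicate into, so the sample is applied eagerly with a boolean mask over the whole table. The same pattern in isolation:

import numpy as np
import pyarrow as pa

table = pa.table({"x": list(range(1000))})
# Keep each row independently with probability sample_rate.
sample_rate = 0.3
mask = np.random.rand(table.num_rows) < sample_rate
sampled = table.filter(mask)  # pa.Table.filter accepts a boolean mask
print(sampled.num_rows)       # ~300 in expectation
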
bigframes/session/executor.py

Lines changed: 15 additions & 11 deletions
@@ -88,7 +88,7 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]:
 
             yield batch
 
-    def to_arrow_table(self) -> pyarrow.Table:
+    def to_arrow_table(self, limit: Optional[int] = None) -> pyarrow.Table:
         # Need to provide schema if no result rows, as arrow can't infer
         # If ther are rows, it is safest to infer schema from batches.
         # Any discrepencies between predicted schema and actual schema will produce errors.
@@ -97,18 +97,21 @@ def to_arrow_table(self) -> pyarrow.Table:
         peek_value = list(peek_it)
         # TODO: Enforce our internal schema on the table for consistency
         if len(peek_value) > 0:
-            return pyarrow.Table.from_batches(
-                itertools.chain(peek_value, batches),  # reconstruct
-            )
+            batches = itertools.chain(peek_value, batches)  # reconstruct
+            if limit:
+                batches = pyarrow_utils.truncate_pyarrow_iterable(
+                    batches, max_results=limit
+                )
+            return pyarrow.Table.from_batches(batches)
         else:
             try:
                 return self._schema.to_pyarrow().empty_table()
             except pa.ArrowNotImplementedError:
                 # Bug with some pyarrow versions, empty_table only supports base storage types, not extension types.
                 return self._schema.to_pyarrow(use_storage_types=True).empty_table()
 
-    def to_pandas(self) -> pd.DataFrame:
-        return io_pandas.arrow_to_pandas(self.to_arrow_table(), self._schema)
+    def to_pandas(self, limit: Optional[int] = None) -> pd.DataFrame:
+        return io_pandas.arrow_to_pandas(self.to_arrow_table(limit=limit), self._schema)
 
     def to_pandas_batches(
         self, page_size: Optional[int] = None, max_results: Optional[int] = None
@@ -158,7 +161,7 @@ def schema(self) -> bigframes.core.schema.ArraySchema:
         ...
 
     @abc.abstractmethod
-    def batches(self) -> ResultsIterator:
+    def batches(self, sample_rate: Optional[float] = None) -> ResultsIterator:
         ...
 
     @property
@@ -200,9 +203,9 @@ def execution_metadata(self) -> ExecutionMetadata:
     def schema(self) -> bigframes.core.schema.ArraySchema:
         return self._data.schema
 
-    def batches(self) -> ResultsIterator:
+    def batches(self, sample_rate: Optional[float] = None) -> ResultsIterator:
         return ResultsIterator(
-            iter(self._data.to_arrow()[1]),
+            iter(self._data.to_arrow(sample_rate=sample_rate)[1]),
             self.schema,
             self._data.metadata.row_count,
             self._data.metadata.total_bytes,
@@ -226,7 +229,7 @@ def execution_metadata(self) -> ExecutionMetadata:
     def schema(self) -> bigframes.core.schema.ArraySchema:
         return self._schema
 
-    def batches(self) -> ResultsIterator:
+    def batches(self, sample_rate: Optional[float] = None) -> ResultsIterator:
         return ResultsIterator(iter([]), self.schema, 0, 0)
 
 
@@ -260,12 +263,13 @@ def schema(self) -> bigframes.core.schema.ArraySchema:
         source_ids = [selection[0] for selection in self._selected_fields]
         return self._data.schema.select(source_ids).rename(dict(self._selected_fields))
 
-    def batches(self) -> ResultsIterator:
+    def batches(self, sample_rate: Optional[float] = None) -> ResultsIterator:
         read_batches = bq_data.get_arrow_batches(
             self._data,
             [x[0] for x in self._selected_fields],
             self._storage_client,
             self._project_id,
+            sample_rate=sample_rate,
         )
         arrow_batches: Iterator[pa.RecordBatch] = map(
             functools.partial(

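`to_arrow_table(limit=...)` leans on `pyarrow_utils.truncate_pyarrow_iterable` to stop consuming batches once the row budget is spent. That helper isn't shown in this diff; a plausible equivalent (an assumption, not the library's code) would be:

from typing import Iterable, Iterator

import pyarrow as pa

def truncate_pyarrow_iterable_sketch(
    batches: Iterable[pa.RecordBatch], max_results: int
) -> Iterator[pa.RecordBatch]:
    # Hypothetical stand-in for pyarrow_utils.truncate_pyarrow_iterable:
    # yield whole batches while under budget, slicing the final one.
    remaining = max_results
    for batch in batches:
        if remaining <= 0:
            return
        if batch.num_rows <= remaining:
            yield batch
            remaining -= batch.num_rows
        else:
            yield batch.slice(0, remaining)
            return
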
tests/system/small/test_anywidget.py

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ def execution_metadata(self) -> ExecutionMetadata:
     def schema(self) -> Any:
         return schema
 
-    def batches(self) -> ResultsIterator:
+    def batches(self, sample_rate=None) -> ResultsIterator:
         return ResultsIterator(
             arrow_batches_val,
             self.schema,

tests/system/small/test_dataframe.py

Lines changed: 3 additions & 3 deletions
@@ -4524,7 +4524,7 @@ def test_df_kurt(scalars_dfs):
         "n_default",
     ],
 )
-def test_sample(scalars_dfs, frac, n, random_state):
+def test_df_to_pandas_sample(scalars_dfs, frac, n, random_state):
     scalars_df, _ = scalars_dfs
     df = scalars_df.sample(frac=frac, n=n, random_state=random_state)
     bf_result = df.to_pandas()
@@ -4535,15 +4535,15 @@ def test_sample(scalars_dfs, frac, n, random_state):
     assert bf_result.shape[1] == scalars_df.shape[1]
 
 
-def test_sample_determinism(penguins_df_default_index):
+def test_df_to_pandas_sample_determinism(penguins_df_default_index):
     df = penguins_df_default_index.sample(n=100, random_state=12345).head(15)
     bf_result = df.to_pandas()
     bf_result2 = df.to_pandas()
 
     pandas.testing.assert_frame_equal(bf_result, bf_result2)
 
 
-def test_sample_raises_value_error(scalars_dfs):
+def test_df_to_pandas_sample_raises_value_error(scalars_dfs):
     scalars_df, _ = scalars_dfs
     with pytest.raises(
         ValueError, match="Only one of 'n' or 'frac' parameter can be specified."

tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_arctanh/out.sql

Lines changed: 3 additions & 1 deletion
@@ -6,9 +6,11 @@ WITH `bfcte_0` AS (
   SELECT
     *,
     CASE
+      WHEN ABS(`float64_col`) < 1
+      THEN ATANH(`float64_col`)
       WHEN ABS(`float64_col`) > 1
       THEN CAST('NaN' AS FLOAT64)
-      ELSE ATANH(`float64_col`)
+      ELSE CAST('Infinity' AS FLOAT64) * `float64_col`
     END AS `bfcol_1`
   FROM `bfcte_0`
 )

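The new snapshot matches numpy's arctanh at every boundary; note how ELSE CAST('Infinity' AS FLOAT64) * float64_col yields +Infinity for 1, -Infinity for -1, and NULL for NULL inputs. A quick cross-check:

import numpy as np

with np.errstate(divide="ignore", invalid="ignore"):
    print(np.arctanh(0.5))   # ~0.5493: the |x| < 1 ATANH branch
    print(np.arctanh(1.0))   # inf:  Infinity * 1
    print(np.arctanh(-1.0))  # -inf: Infinity * -1
    print(np.arctanh(2.0))   # nan:  the |x| > 1 branch
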