Skip to content

Commit afc0d5d

Browse files
Support cudf-polars str.zfill (#19081)
Closes #19035. Closes #16480. I believe this needs pola-rs/polars#22985 to pass the one remaining failing test and for the column overload described here: #19035 (comment). Authors: - https://github.com/brandon-b-miller - Matthew Roeschke (https://github.com/mroeschke). Approvers: - Matthew Roeschke (https://github.com/mroeschke). URL: #19081
1 parent d64233b commit afc0d5d

File tree

3 files changed

+171
-0
lines changed

3 files changed

+171
-0
lines changed

python/cudf_polars/cudf_polars/dsl/expressions/string.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import TYPE_CHECKING, Any, ClassVar
1212

1313
from polars.exceptions import InvalidOperationError
14+
from polars.polars import dtype_str_repr
1415

1516
import pylibcudf as plc
1617

@@ -137,6 +138,7 @@ def from_polars(cls, obj: pl_expr.StringFunction) -> Self:
137138
Name.Reverse,
138139
Name.Tail,
139140
Name.Titlecase,
141+
Name.ZFill,
140142
}
141143
__slots__ = ("_regex_program", "name", "options")
142144
_non_child = ("dtype", "name", "options")
@@ -264,6 +266,17 @@ def _validate_input(self) -> None:
264266
raise NotImplementedError(
265267
"strip operations only support scalar patterns"
266268
)
269+
elif self.name is StringFunction.Name.ZFill:
270+
if isinstance(self.children[1], Literal):
271+
_, width = self.children
272+
assert isinstance(width, Literal)
273+
if width.value is not None and width.value < 0:
274+
dtypestr = dtype_str_repr(width.dtype.polars)
275+
raise InvalidOperationError(
276+
f"conversion from `{dtypestr}` to `u64` "
277+
f"failed in column 'literal' for 1 out of "
278+
f"1 values: [{width.value}]"
279+
) from None
267280

268281
@staticmethod
269282
def _create_regex_program(
@@ -322,6 +335,63 @@ def do_evaluate(
322335
),
323336
dtype=self.dtype,
324337
)
338+
elif self.name is StringFunction.Name.ZFill:
339+
# TODO: expensive validation
340+
# polars pads based on bytes, libcudf by visual width
341+
# only pass chars if the visual width matches the byte length
342+
column = self.children[0].evaluate(df, context=context)
343+
col_len_bytes = plc.strings.attributes.count_bytes(column.obj)
344+
col_len_chars = plc.strings.attributes.count_characters(column.obj)
345+
equal = plc.binaryop.binary_operation(
346+
col_len_bytes,
347+
col_len_chars,
348+
plc.binaryop.BinaryOperator.NULL_EQUALS,
349+
plc.DataType(plc.TypeId.BOOL8),
350+
)
351+
if not plc.reduce.reduce(
352+
equal,
353+
plc.aggregation.all(),
354+
plc.DataType(plc.TypeId.BOOL8),
355+
).to_py():
356+
raise InvalidOperationError(
357+
"zfill only supports ascii strings with no unicode characters"
358+
)
359+
if isinstance(self.children[1], Literal):
360+
width = self.children[1]
361+
assert isinstance(width, Literal)
362+
if width.value is None:
363+
return Column(
364+
plc.Column.from_scalar(
365+
plc.Scalar.from_py(None, self.dtype.plc),
366+
column.size,
367+
),
368+
self.dtype,
369+
)
370+
return Column(
371+
plc.strings.padding.zfill(column.obj, width.value), self.dtype
372+
)
373+
else:
374+
col_width = self.children[1].evaluate(df, context=context)
375+
assert isinstance(col_width, Column)
376+
all_gt_0 = plc.binaryop.binary_operation(
377+
col_width.obj,
378+
plc.Scalar.from_py(0, plc.DataType(plc.TypeId.INT64)),
379+
plc.binaryop.BinaryOperator.GREATER_EQUAL,
380+
plc.DataType(plc.TypeId.BOOL8),
381+
)
382+
383+
if not plc.reduce.reduce(
384+
all_gt_0,
385+
plc.aggregation.all(),
386+
plc.DataType(plc.TypeId.BOOL8),
387+
).to_py():
388+
raise InvalidOperationError("fill conversion failed.")
389+
390+
return Column(
391+
plc.strings.padding.zfill_by_widths(column.obj, col_width.obj),
392+
self.dtype,
393+
)
394+
325395
elif self.name is StringFunction.Name.Contains:
326396
child, arg = self.children
327397
column = child.evaluate(df, context=context)

python/cudf_polars/cudf_polars/testing/plugin.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ def pytest_configure(config: pytest.Config) -> None:
144144
"tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
145145
"tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg",
146146
"tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
147+
"tests/unit/operations/namespaces/string/test_pad.py::test_str_zfill_unicode_not_respected": "polars doesn't add zeros for unicode characters.",
147148
"tests/unit/operations/test_rolling.py::test_rolling_group_by_empty_groups_by_take_6330": "Ordering difference, might be polars bug",
148149
"tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU",
149150
"tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU",

python/cudf_polars/tests/expressions/test_stringfunction.py

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -504,6 +504,106 @@ def test_string_join(ldf, ignore_nulls, delimiter):
504504
assert_gpu_result_equal(q)
505505

506506

507+
@pytest.mark.parametrize("fill", [0, 1, 2, 5, 999, -1, None])
@pytest.mark.parametrize(
    "input_strings",
    [["1", "0"], ["123", "45"], ["", "0"], ["abc", "def"]],
)
def test_string_zfill(fill, input_strings):
    """Exercise ``str.zfill`` with a literal width on small string columns.

    A negative literal width is expected to fail on both engines (with a
    different exception type on each); every other width, including
    ``None``, goes through the GPU-vs-CPU result comparison.
    """
    query = pl.LazyFrame({"a": input_strings}).select(
        pl.col("a").str.zfill(fill)
    )

    # ``None`` must be excluded before the comparison with 0.
    is_negative = fill is not None and fill < 0
    if not is_negative:
        assert_gpu_result_equal(query)
        return

    assert_collect_raises(
        query,
        polars_except=pl.exceptions.InvalidOperationError,
        cudf_except=pl.exceptions.ComputeError,
    )
540+
541+
542+
@pytest.mark.parametrize(
    "fill",
    [
        # On Polars < 1.30 these widths are marked as expected failures.
        pytest.param(width, marks=pytest.mark.xfail(reason="fixed in Polars 1.30"))
        if POLARS_VERSION_LT_130
        else width
        for width in (5, 999)
    ],
)
def test_string_zfill_pl_129(fill):
    """Exercise ``str.zfill`` on sign-prefixed strings (``"-1"``, ``"+2"``)."""
    signed = pl.LazyFrame({"a": ["-1", "+2"]})
    padded = signed.select(pl.col("a").str.zfill(fill))
    assert_gpu_result_equal(padded)
557+
558+
559+
@pytest.mark.parametrize(
    "fill",
    [
        0,
        1,
        2,
        # On Polars < 1.30 these widths are marked as expected failures.
        *(
            pytest.param(
                width, marks=pytest.mark.xfail(reason="fixed in Polars 1.30")
            )
            if POLARS_VERSION_LT_130
            else width
            for width in (5, 999)
        ),
        -1,
        pytest.param(None, marks=pytest.mark.xfail(reason="None dtype")),
    ],
)
def test_string_zfill_column(fill):
    """Exercise ``str.zfill`` when the width comes from a column.

    The ``fill`` column repeats the same width for every row. A negative
    width is expected to raise on both engines; the exception type expected
    from the GPU engine depends on the installed Polars version.
    """
    strings = ["1", "0", "123", "45", "", "0", "-1", "+2", "abc", "def"]
    frame = pl.DataFrame({"input_strings": strings, "fill": [fill] * 10})
    query = frame.lazy().select(
        pl.col("input_strings").str.zfill(pl.col("fill"))
    )

    # ``None`` must be excluded before the comparison with 0.
    is_negative = fill is not None and fill < 0
    if not is_negative:
        assert_gpu_result_equal(query)
        return

    if POLARS_VERSION_LT_130:
        gpu_error = pl.exceptions.ComputeError
    else:
        gpu_error = pl.exceptions.InvalidOperationError
    assert_collect_raises(
        query,
        polars_except=pl.exceptions.InvalidOperationError,
        cudf_except=gpu_error,
    )
593+
594+
595+
def test_string_zfill_forbidden_chars():
    """Check that non-ASCII input makes the GPU ``str.zfill`` path raise.

    The CPU engine is expected to raise nothing (empty exception tuple),
    while the exception type expected from the GPU engine depends on the
    installed Polars version.
    """
    if POLARS_VERSION_LT_130:
        gpu_error = pl.exceptions.ComputeError
    else:
        gpu_error = pl.exceptions.InvalidOperationError

    frame = pl.LazyFrame({"a": ["Café", "345", "東京", None]})
    query = frame.select(pl.col("a").str.zfill(3))
    assert_collect_raises(query, polars_except=(), cudf_except=gpu_error)
605+
606+
507607
@pytest.mark.parametrize(
508608
"width",
509609
[

0 commit comments

Comments
 (0)