Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added escape_regex operation to the str namespace and as a global function #19257

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ raw-cpuid = "11"
rayon = "1.9"
recursive = "0.1"
regex = "1.9"
regex-syntax = "0.8.5"
reqwest = { version = "0.12", default-features = false }
ryu = "1.0.13"
serde = { version = "1.0.188", features = ["derive", "rc"] }
Expand Down
1 change: 1 addition & 0 deletions crates/polars-ops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ rand = { workspace = true, optional = true, features = ["small_rng", "std"] }
rand_distr = { workspace = true, optional = true }
rayon = { workspace = true }
regex = { workspace = true }
regex-syntax = { workspace = true }
serde = { workspace = true, optional = true }
serde_json = { workspace = true, optional = true }
unicode-reverse = { workspace = true, optional = true }
Expand Down
21 changes: 21 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/escape_regex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
use polars_core::prelude::{StringChunked, StringChunkedBuilder};

#[inline]
pub fn escape_regex_str(s: &str) -> String {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this function so that pl.escape_regex and str.escape_regex share the same implementation.

regex_syntax::escape(s)
}

/// Escapes the regular-expression meta characters of every string in `ca`.
///
/// Null entries stay null. One scratch `String` is cleared and reused across
/// rows instead of building a new `String` for each value.
pub fn escape_regex(ca: &StringChunked) -> StringChunked {
    let mut out = StringChunkedBuilder::new(ca.name().clone(), ca.len());
    let mut scratch = String::new();
    for maybe_str in ca.iter() {
        match maybe_str {
            Some(raw) => {
                // `escape_into` appends, so the scratch buffer must be emptied first.
                scratch.clear();
                regex_syntax::escape_into(raw, &mut scratch);
                out.append_value(&scratch);
            },
            None => {
                out.append_null();
            },
        }
    }
    out.finish()
}
5 changes: 4 additions & 1 deletion crates/polars-ops/src/chunked_array/strings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ mod case;
#[cfg(feature = "strings")]
mod concat;
#[cfg(feature = "strings")]
mod escape_regex;
#[cfg(feature = "strings")]
mod extract;
#[cfg(feature = "find_many")]
mod find_many;
Expand All @@ -20,12 +22,13 @@ mod split;
mod strip;
#[cfg(feature = "strings")]
mod substring;

#[cfg(all(not(feature = "nightly"), feature = "strings"))]
mod unicode_internals;

#[cfg(feature = "strings")]
pub use concat::*;
#[cfg(feature = "strings")]
pub use escape_regex::*;
#[cfg(feature = "find_many")]
pub use find_many::*;
#[cfg(feature = "extract_jsonpath")]
Expand Down
6 changes: 6 additions & 0 deletions crates/polars-ops/src/chunked_array/strings/namespace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,12 @@ pub trait StringNameSpaceImpl: AsString {

substring::tail(ca, n.i64()?)
}
/// Returns a new string array in which the regular expression meta
/// characters of each value have been escaped.
#[cfg(feature = "strings")]
fn str_escape_regex(&self) -> StringChunked {
    escape_regex::escape_regex(self.as_string())
}
}

impl StringNameSpaceImpl for StringChunked {}
14 changes: 14 additions & 0 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ pub enum StringFunction {
ascii_case_insensitive: bool,
overlapping: bool,
},
#[cfg(feature = "regex")]
EscapeRegex,
}

impl StringFunction {
Expand Down Expand Up @@ -197,6 +199,8 @@ impl StringFunction {
ReplaceMany { .. } => mapper.with_same_dtype(),
#[cfg(feature = "find_many")]
ExtractMany { .. } => mapper.with_dtype(DataType::List(Box::new(DataType::String))),
#[cfg(feature = "regex")]
EscapeRegex => mapper.with_same_dtype(),
}
}
}
Expand Down Expand Up @@ -285,6 +289,8 @@ impl Display for StringFunction {
ReplaceMany { .. } => "replace_many",
#[cfg(feature = "find_many")]
ExtractMany { .. } => "extract_many",
#[cfg(feature = "regex")]
EscapeRegex => "escape_regex",
};
write!(f, "str.{s}")
}
Expand Down Expand Up @@ -400,6 +406,8 @@ impl From<StringFunction> for SpecialEq<Arc<dyn ColumnsUdf>> {
} => {
map_as_slice!(extract_many, ascii_case_insensitive, overlapping)
},
#[cfg(feature = "regex")]
EscapeRegex => map!(escape_regex),
}
}
}
Expand Down Expand Up @@ -1023,3 +1031,9 @@ pub(super) fn json_path_match(s: &[Column]) -> PolarsResult<Column> {
let pat = s[1].str()?;
Ok(ca.json_path_match(pat)?.into_column())
}

/// Escapes regex meta characters in a string column.
///
/// Returns the error from `Column::str` when `s` cannot be accessed as a
/// string column.
#[cfg(feature = "regex")]
pub(super) fn escape_regex(s: &Column) -> PolarsResult<Column> {
    s.str()
        .map(|strings| strings.str_escape_regex().into_column())
}
10 changes: 10 additions & 0 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -592,4 +592,14 @@ impl StringNameSpace {
None,
)
}

/// Escapes all regular expression meta characters in each string value.
///
/// Takes no arguments besides the expression itself and produces a
/// `StringFunction::EscapeRegex` function expression.
// Gated on `regex` rather than `strings`: `StringFunction::EscapeRegex` is only
// compiled in when the `regex` feature is enabled, so a `strings`-only gate
// would reference a variant that does not exist in a build without `regex`.
#[cfg(feature = "regex")]
pub fn escape_regex(self) -> Expr {
    self.0.map_many_private(
        FunctionExpr::StringExpr(StringFunction::EscapeRegex),
        &[],
        false,
        None,
    )
}
}
5 changes: 5 additions & 0 deletions crates/polars-python/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,9 @@ impl PyExpr {
.extract_many(patterns.inner, ascii_case_insensitive, overlapping)
.into()
}

// Builds the `str.escape_regex` expression from a clone of the wrapped
// expression and converts it back into `Self`.
#[cfg(feature = "regex")]
fn str_escape_regex(&self) -> Self {
    let escaped = self.inner.clone().str().escape_regex();
    escaped.into()
}
}
2 changes: 2 additions & 0 deletions crates/polars-python/src/functions/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ mod misc;
mod random;
mod range;
mod string_cache;
mod strings;
mod whenthen;

pub use aggregation::*;
Expand All @@ -20,4 +21,5 @@ pub use misc::*;
pub use random::*;
pub use range::*;
pub use string_cache::*;
pub use strings::*;
pub use whenthen::*;
7 changes: 7 additions & 0 deletions crates/polars-python/src/functions/strings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use pyo3::prelude::*;

// Escapes the regex meta characters of a single string by delegating to
// `escape_regex_str` in polars-ops; this wrapper itself never returns `Err`.
#[pyfunction]
pub fn escape_regex(s: &str) -> PyResult<String> {
    Ok(polars_ops::chunked_array::strings::escape_regex_str(s))
}
4 changes: 4 additions & 0 deletions crates/polars-python/src/lazyframe/visitor/expr_nodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ pub enum PyStringFunction {
ZFill,
ContainsMany,
ReplaceMany,
EscapeRegex,
}

#[pymethods]
Expand Down Expand Up @@ -953,6 +954,9 @@ pub(crate) fn into_py(py: Python<'_>, expr: &AExpr) -> PyResult<PyObject> {
StringFunction::ExtractMany { .. } => {
return Err(PyNotImplementedError::new_err("extract_many"))
},
StringFunction::EscapeRegex => {
(PyStringFunction::EscapeRegex.into_py(py),).to_object(py)
},
},
FunctionExpr::StructExpr(_) => {
return Err(PyNotImplementedError::new_err("struct expr"))
Expand Down
2 changes: 2 additions & 0 deletions py-polars/polars/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
datetime_ranges,
duration,
element,
escape_regex,
exclude,
field,
first,
Expand Down Expand Up @@ -303,6 +304,7 @@
"time_range",
"time_ranges",
"zeros",
"escape_regex",
# polars.functions.aggregation
"all",
"all_horizontal",
Expand Down
22 changes: 22 additions & 0 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2781,6 +2781,28 @@ def concat(
delimiter = "-"
return self.join(delimiter, ignore_nulls=ignore_nulls)

def escape_regex(self) -> Expr:
    r"""
    Escape all regular expression meta characters in each string value.

    Null values are left as null.

    Examples
    --------
    >>> df = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    >>> df.with_columns(pl.col("text").str.escape_regex().alias("escaped"))
    shape: (4, 2)
    ┌──────────┬──────────────┐
    │ text     ┆ escaped      │
    │ ---      ┆ ---          │
    │ str      ┆ str          │
    ╞══════════╪══════════════╡
    │ abc      ┆ abc          │
    │ def      ┆ def          │
    │ null     ┆ null         │
    │ abc(\w+) ┆ abc\(\\w\+\) │
    └──────────┴──────────────┘
    """
    escaped = self._pyexpr.str_escape_regex()
    return wrap_expr(escaped)


def _validate_format_argument(format: str | None) -> None:
if format is not None and ".%f" in format:
Expand Down
3 changes: 3 additions & 0 deletions py-polars/polars/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from polars.functions.business import business_day_count
from polars.functions.col import col
from polars.functions.eager import align_frames, concat
from polars.functions.escape_regex import escape_regex
from polars.functions.lazy import (
approx_n_unique,
arctan2,
Expand Down Expand Up @@ -170,4 +171,6 @@
# polars.functions.whenthen
"when",
"sql_expr",
# polars.functions.escape_regex
"escape_regex",
]
27 changes: 27 additions & 0 deletions py-polars/polars/functions/escape_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from __future__ import annotations

import contextlib

with contextlib.suppress(ImportError): # Module not available when building docs
import polars.polars as plr
import polars._reexport as pl


def escape_regex(s: str) -> str:
    r"""
    Escape all regular expression meta characters in a string.

    Parameters
    ----------
    s
        The string whose meta characters will be escaped.

    Returns
    -------
    str
        The escaped string.

    Raises
    ------
    TypeError
        If `s` is an `Expr`, or is any other object that is not a `str`.

    """
    # Expressions get a dedicated message pointing at the expression method.
    if isinstance(s, pl.Expr):
        msg = "escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead"
        raise TypeError(msg)

    if not isinstance(s, str):
        msg = f"escape_regex function supports only `str` type, got `{type(s)}`"
        raise TypeError(msg)

    return plr.escape_regex(s)
4 changes: 4 additions & 0 deletions py-polars/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,10 @@ fn polars(py: Python, m: &Bound<PyModule>) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(functions::set_random_seed))
.unwrap();

// Functions - escape_regex
m.add_wrapped(wrap_pyfunction!(functions::escape_regex))
.unwrap();

// Exceptions - Errors
m.add(
"PolarsError",
Expand Down
19 changes: 19 additions & 0 deletions py-polars/tests/unit/functions/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,3 +538,22 @@ def test_head_tail(fruits_cars: pl.DataFrame) -> None:
res_expr = fruits_cars.select(pl.tail("A", 2))
expected = pl.Series("A", [4, 5])
assert_series_equal(res_expr.to_series(), expected)


def test_escape_regex() -> None:
    # A plain string has its meta characters escaped.
    assert pl.escape_regex("abc(\\w+)") == "abc\\(\\\\w\\+\\)"

    # Passing an expression is rejected with a pointer to the expression method.
    frame = pl.DataFrame({"text": ["abc", "def", None, "abc(\\w+)"]})
    expr_msg = "escape_regex function is unsupported for `Expr`, you may want use `Expr.str.escape_regex` instead"
    with pytest.raises(TypeError, match=expr_msg):
        frame.with_columns(escaped=pl.escape_regex(pl.col("text")))  # type: ignore[arg-type]

    # Any other non-string input is rejected as well.
    type_msg = "escape_regex function supports only `str` type, got `<class 'int'>`"
    with pytest.raises(TypeError, match=type_msg):
        pl.escape_regex(3)  # type: ignore[arg-type]
Original file line number Diff line number Diff line change
Expand Up @@ -1727,3 +1727,16 @@ def test_extract_many() -> None:
assert df.select(pl.col("values").str.extract_many("patterns")).to_dict(
as_series=False
) == {"values": [["disco"], ["rhap", "ody"]]}


def test_escape_regex() -> None:
    # Nulls stay null; only the last value contains meta characters to escape.
    raw = ["abc", "def", None, "abc(\\w+)"]
    escaped = ["abc", "def", None, "abc\\(\\\\w\\+\\)"]

    result_df = pl.DataFrame({"text": raw}).with_columns(
        pl.col("text").str.escape_regex().alias("escaped")
    )
    expected_df = pl.DataFrame({"text": raw, "escaped": escaped})

    assert_frame_equal(result_df, expected_df)