Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add count_matches/2, re_count_matches/2, re_scan/2 and re_named_captures/2 to Series #895

Merged
merged 3 commits into from
Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ defmodule Explorer.Backend.LazySeries do
split_into: 3,
json_decode: 2,
json_path_match: 2,
count_matches: 2,
re_count_matches: 2,
re_scan: 2,
re_named_captures: 2,
# Float round
round: 2,
floor: 1,
Expand Down Expand Up @@ -1151,6 +1155,43 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, {:u, 32})
end

@impl true
def count_matches(series, substring) do
  # The lazy operation node and the wrapping series share the same dtype.
  dtype = {:u, 32}
  args = [lazy_series!(series), substring]

  :count_matches
  |> new(args, dtype)
  |> Backend.Series.new(dtype)
end

@impl true
def re_count_matches(series, pattern) do
  # The lazy operation node and the wrapping series share the same dtype.
  dtype = {:u, 32}
  args = [lazy_series!(series), pattern]

  :re_count_matches
  |> new(args, dtype)
  |> Backend.Series.new(dtype)
end

@impl true
def re_scan(series, pattern) do
  # Every row yields a list of matched strings.
  dtype = {:list, :string}
  args = [lazy_series!(series), pattern]

  :re_scan
  |> new(args, dtype)
  |> Backend.Series.new(dtype)
end

@impl true
def re_named_captures(_series, _pattern) do
raise """
#{unsupported(:re_named_captures, 2)}

If you want to capture named groups from a column, you must do so outside of a query.
For example, instead of:

Explorer.DataFrame.mutate(df, new_column: re_named_captures(column, ~S/(a|b)/))

You must write:

Explorer.DataFrame.put(df, :new_column, Explorer.Series.re_named_captures(column, ~S/(a|b)/))
"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could probably make this work by storing the original backend as a field of the LazySeries and then asking it to return the names. But we can do this in another PR if desired, no worries for now IMO.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know yet how to capture that information in the context of the lazy frame, but I will research a bit. For now I'm going to merge this. Thanks!

end

@remaining_non_lazy_operations [
at: 2,
at_every: 2,
Expand Down
4 changes: 4 additions & 0 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -290,10 +290,14 @@ defmodule Explorer.Backend.Series do
@callback split_into(s, String.t(), list(String.t() | atom())) :: s
@callback json_decode(s, dtype()) :: s
@callback json_path_match(s, String.t()) :: s
@callback count_matches(s, String.t()) :: s

## String - Regular expression versions
@callback re_contains(s, String.t()) :: s
@callback re_replace(s, String.t(), String.t()) :: s
@callback re_count_matches(s, String.t()) :: s
@callback re_scan(s, String.t()) :: s
@callback re_named_captures(s, String.t()) :: s

# Date / DateTime

Expand Down
4 changes: 4 additions & 0 deletions lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ defmodule Explorer.PolarsBackend.Expression do
split_into: 3,
json_decode: 2,
json_path_match: 2,
count_matches: 2,
re_count_matches: 2,
re_scan: 2,

# Lists
join: 2,
Expand Down Expand Up @@ -168,6 +171,7 @@ defmodule Explorer.PolarsBackend.Expression do
concat: 1,
column: 1,
correlation: 4,
re_named_captures: 2,
covariance: 3
]

Expand Down
3 changes: 3 additions & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,9 @@ defmodule Explorer.PolarsBackend.Native do
def s_coalesce(_s, _other), do: err()
def s_concat(_series_list), do: err()
def s_contains(_s, _pattern, _is_literal), do: err()
# Elixir-side placeholders for the Rust NIFs of the same names (`s_count_matches`,
# `s_extract_all`, `s_extract_groups`), which are registered in `rustler::init!`.
# NOTE(review): `err()` is presumably what runs when the native library is not
# loaded — confirm against its definition.
#
# `_is_literal` is forwarded to the native `count_matches`: callers pass `true`
# for literal substring counting and `false` for regex counting.
def s_count_matches(_s, _pattern, _is_literal), do: err()
def s_extract_all(_s, _pattern), do: err()
def s_extract_groups(_s, _pattern), do: err()
def s_cumulative_max(_s, _reverse), do: err()
def s_cumulative_min(_s, _reverse), do: err()
def s_cumulative_sum(_s, _reverse), do: err()
Expand Down
20 changes: 20 additions & 0 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,26 @@ defmodule Explorer.PolarsBackend.Series do
def json_path_match(series, json_path) do
  Shared.apply_series(series, :s_json_path_match, [json_path])
end

@impl true
def count_matches(series, substring) do
  # `s_count_matches` is shared with `re_count_matches/2`; the flag selects literal matching.
  literal? = true
  Shared.apply_series(series, :s_count_matches, [substring, literal?])
end

@impl true
def re_count_matches(series, pattern) do
  # `s_count_matches` is shared with `count_matches/2`; the flag selects regex matching.
  literal? = false
  Shared.apply_series(series, :s_count_matches, [pattern, literal?])
end

@impl true
def re_scan(series, pattern),
  do: Shared.apply_series(series, :s_extract_all, [pattern])

@impl true
def re_named_captures(series, pattern),
  do: Shared.apply_series(series, :s_extract_groups, [pattern])

# Polars specific functions

def name(series), do: Shared.apply_series(series, :s_name)
Expand Down
149 changes: 149 additions & 0 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5794,6 +5794,155 @@ defmodule Explorer.Series do
def split_into(%Series{dtype: dtype}, by, [_ | _]) when is_binary(by),
do: dtype_error("split_into/3", dtype, [:string])

@doc """
Counts how many times a literal substring occurs in each string of the series.

> ### Notice {: .warning}
>
> This function detects only literal strings. For regular expressions, see `re_count_matches/2`.

## Examples

    iex> s = Explorer.Series.from_list(["abc", "def", "bcd", nil])
    iex> Explorer.Series.count_matches(s, "bc")
    #Explorer.Series<
      Polars[4]
      u32 [1, 0, 1, nil]
    >
"""
@doc type: :string_wise
@spec count_matches(Series.t(), String.t()) :: Series.t()
def count_matches(%Series{dtype: :string} = series, substring) when K.is_binary(substring) do
  apply_series(series, :count_matches, [substring])
end

# Reached for non-string series, and also for a string series given a non-binary substring.
def count_matches(%Series{dtype: dtype}, _substring) do
  dtype_error("count_matches/2", dtype, [:string])
end

@doc """
Counts how many times a regular expression pattern matches each string of the series.

> ### Notice {: .warning}
>
> This function matches against a regular expression. It does not expect an Elixir regex,
> but an escaped string, and you can use the `~S` sigil for escaping it. Since each Explorer
> backend may have its own regular expression rules, you must consult their underlying
> engine. For the default backend (Polars), the rules are outlined in the Rust crate named
> [`regex`](https://docs.rs/regex/latest/regex/).
>
> To count matching literal strings, you can use `count_matches/2`.

## Examples

    iex> s = Explorer.Series.from_list(["abc", "def def", "bcd", nil])
    iex> Explorer.Series.re_count_matches(s, ~S/(a|e)/)
    #Explorer.Series<
      Polars[4]
      u32 [1, 2, 0, nil]
    >
"""
@doc type: :string_wise
@spec re_count_matches(Series.t(), String.t()) :: Series.t()
def re_count_matches(%Series{dtype: :string} = series, pattern) when K.is_binary(pattern) do
  apply_series(series, :re_count_matches, [pattern])
end

# Elixir regexes are rejected explicitly so the caller gets an actionable message.
def re_count_matches(%Series{dtype: :string}, %Regex{}) do
  raise ArgumentError,
        "standard regexes cannot be used as pattern because it may be incompatible with the backend. " <>
          "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`"
end

# Reached for non-string series, and for a string series given any other pattern term.
def re_count_matches(%Series{dtype: dtype}, _pattern) do
  dtype_error("re_count_matches/2", dtype, [:string])
end

@doc """
Scans each string for all matches of the given regex pattern.

Each successive non-overlapping regex match in an individual string is extracted,
and the matches for that string are returned as a list.

> ### Notice {: .warning}
>
> This function matches against a regular expression. It does not expect an Elixir regex,
> but an escaped string, and you can use the `~S` sigil for escaping it. Since each Explorer
> backend may have its own regular expression rules, you must consult their underlying
> engine. For the default backend (Polars), the rules are outlined in the Rust crate named
> [`regex`](https://docs.rs/regex/latest/regex/).

## Examples

    iex> s = Explorer.Series.from_list(["abc", "def def", "bcd", nil])
    iex> Explorer.Series.re_scan(s, ~S/(b|d)/)
    #Explorer.Series<
      Polars[4]
      list[string] [["b"], ["d", "d"], ["b", "d"], nil]
    >

"""
@doc type: :string_wise
@spec re_scan(Series.t(), String.t()) :: Series.t()
def re_scan(%Series{dtype: :string} = series, pattern) when K.is_binary(pattern) do
  apply_series(series, :re_scan, [pattern])
end

# Elixir regexes are rejected explicitly so the caller gets an actionable message.
def re_scan(%Series{dtype: :string}, %Regex{}) do
  raise ArgumentError,
        "standard regexes cannot be used as pattern because it may be incompatible with the backend. " <>
          "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`"
end

# Reached for non-string series, and for a string series given any other pattern term.
def re_scan(%Series{dtype: dtype}, _pattern) do
  dtype_error("re_scan/2", dtype, [:string])
end

@doc """
Extracts all capture groups of the given regex pattern as a struct.

All group names are strings. If your pattern contains unnamed groups, their numerical position
is converted to a string, starting from "1".

> ### Notice {: .warning}
>
> This function does not work inside the context of `Explorer.Query`. If you are working on
> a data frame, you first need to extract the series from it. See `Explorer.DataFrame.pull/2`.
>
> This function matches against a regular expression. It does not expect an Elixir regex,
> but an escaped string, and you can use the `~S` sigil for escaping it. Since each Explorer
> backend may have its own regular expression rules, you must consult their underlying
> engine. For the default backend (Polars), the rules are outlined in the Rust crate named
> [`regex`](https://docs.rs/regex/latest/regex/).

## Examples

    iex> s = Explorer.Series.from_list(["abc", "def def", "bcd", nil])
    iex> Explorer.Series.re_named_captures(s, ~S/(b|d)/)
    #Explorer.Series<
      Polars[4]
      struct[1] [%{"1" => "b"}, %{"1" => "d"}, %{"1" => "b"}, %{"1" => nil}]
    >

    iex> s = Explorer.Series.from_list(["alice@service.com", "bob@example.com"])
    iex> Explorer.Series.re_named_captures(s, ~S/(?<account>[^@]+)@(?<host>.*)/)
    #Explorer.Series<
      Polars[2]
      struct[2] [%{"account" => "alice", "host" => "service.com"}, %{"account" => "bob", "host" => "example.com"}]
    >

"""
@doc type: :string_wise
@spec re_named_captures(Series.t(), String.t()) :: Series.t()
def re_named_captures(%Series{dtype: :string} = series, pattern) when K.is_binary(pattern) do
  apply_series(series, :re_named_captures, [pattern])
end

# Elixir regexes are rejected explicitly so the caller gets an actionable message.
def re_named_captures(%Series{dtype: :string}, %Regex{}) do
  raise ArgumentError,
        "standard regexes cannot be used as pattern because it may be incompatible with the backend. " <>
          "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`"
end

# Reached for non-string series, and for a string series given any other pattern term.
def re_named_captures(%Series{dtype: dtype}, _pattern) do
  dtype_error("re_named_captures/2", dtype, [:string])
end

# Float

@doc """
Expand Down
3 changes: 2 additions & 1 deletion native/explorer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ features = [
"range",
"rank",
"propagate_nans",
"extract_jsonpath"
"extract_jsonpath",
"extract_groups",
]

[dependencies.polars-ops]
Expand Down
18 changes: 18 additions & 0 deletions native/explorer/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1154,3 +1154,21 @@ pub fn expr_over(left: ExExpr, groups: Vec<ExExpr>) -> ExExpr {
let expr = left.clone_inner().over(groups);
ExExpr::new(expr)
}

/// Builds an expression that counts occurrences of `pattern` in each string,
/// passing `true` as the literal flag.
#[rustler::nif]
pub fn expr_count_matches(expr: ExExpr, pattern: &str) -> ExExpr {
    let needle = pattern.lit();
    let counted = expr.clone_inner().str().count_matches(needle, true);
    ExExpr::new(counted)
}

/// Builds an expression that counts matches of `pattern` in each string,
/// passing `false` as the literal flag.
#[rustler::nif]
pub fn expr_re_count_matches(expr: ExExpr, pattern: &str) -> ExExpr {
    let regex = pattern.lit();
    let counted = expr.clone_inner().str().count_matches(regex, false);
    ExExpr::new(counted)
}

/// Builds an expression that applies `extract_all` with `pattern` to each string.
#[rustler::nif]
pub fn expr_re_scan(expr: ExExpr, pattern: &str) -> ExExpr {
    let regex = pattern.lit();
    let scanned = expr.clone_inner().str().extract_all(regex);
    ExExpr::new(scanned)
}
6 changes: 6 additions & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,9 @@ rustler::init!(
expr_re_replace,
expr_json_path_match,
expr_split_into,
expr_count_matches,
expr_re_count_matches,
expr_re_scan,
// float round expressions
expr_round,
expr_floor,
Expand Down Expand Up @@ -327,6 +330,9 @@ rustler::init!(
s_coalesce,
s_concat,
s_contains,
s_count_matches,
s_extract_all,
s_extract_groups,
s_cos,
s_upcase,
s_day_of_week,
Expand Down
38 changes: 38 additions & 0 deletions native/explorer/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1857,3 +1857,41 @@ pub fn s_row_index(series: ExSeries) -> Result<ExSeries, ExplorerError> {
let s = Series::new("row_index", 0..len);
Ok(ExSeries::new(s))
}

/// Counts how many times `pattern` occurs in each string of `s1`.
///
/// `literal` is forwarded unchanged as the second argument of the underlying
/// `count_matches` call. The Elixir backend passes `true` for literal substring
/// counting and `false` for regex counting.
///
/// Returns an error if `s1` is not a string series or if the underlying
/// `count_matches` call fails.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_count_matches(
    s1: ExSeries,
    pattern: &str,
    literal: bool,
) -> Result<ExSeries, ExplorerError> {
    // Pass the flag straight through; branching on it only to re-supply the
    // same boolean as a constant was redundant.
    let chunked_array = s1.str()?.count_matches(pattern, literal)?;
    Ok(ExSeries::new(chunked_array.into()))
}

/// Applies `extract_all` with `pattern` to the string series `s1` and wraps the
/// result as a new series.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_extract_all(s1: ExSeries, pattern: &str) -> Result<ExSeries, ExplorerError> {
    let strings = s1.str()?;
    let all_matches = strings.extract_all(pattern)?;
    Ok(ExSeries::new(all_matches.into()))
}

/// Extracts the capture groups of `pattern` from each string in `s1`.
///
/// The series is placed in a one-column frame and evaluated lazily, so the
/// expression-level `extract_groups` can be used. The resulting column keeps
/// the original series name.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_extract_groups(s1: ExSeries, pattern: &str) -> Result<ExSeries, ExplorerError> {
    let name = s1.name();
    let groups_expr = col(name).str().extract_groups(pattern)?.alias(name);

    let frame = s1
        .clone_inner()
        .into_frame()
        .lazy()
        .with_column(groups_expr)
        .collect()?;

    let s2 = frame.column(name)?.clone();
    Ok(ExSeries::new(s2))
}
Loading
Loading