Skip to content

Commit c5b93e6

Browse files
committed
fix regex plugin to run inside duckdb
1 parent f5ebaaa commit c5b93e6

File tree

3 files changed

+91
-39
lines changed

3 files changed

+91
-39
lines changed

countess/plugins/regex.py

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
2-
import re
3-
from typing import Any, Optional
2+
from typing import Optional
3+
4+
from duckdb import DuckDBPyConnection, DuckDBPyRelation
45

56
from countess import VERSION
67
from countess.core.parameters import (
@@ -11,7 +12,8 @@
1112
MultiParam,
1213
StringParam,
1314
)
14-
from countess.core.plugins import DuckdbThreadedTransformPlugin
15+
from countess.core.plugins import DuckdbSimplePlugin
16+
from countess.utils.duckdb import duckdb_escape_identifier, duckdb_escape_literal
1517

1618
logger = logging.getLogger(__name__)
1719

@@ -21,7 +23,7 @@ class OutputColumnsMultiParam(MultiParam):
2123
datatype = DataTypeChoiceParam("Column Type", "STRING")
2224

2325

24-
class RegexToolPlugin(DuckdbThreadedTransformPlugin):
26+
class RegexToolPlugin(DuckdbSimplePlugin):
2527
name = "Regex Tool"
2628
description = "Apply regular expressions to a column to make new column(s)"
2729
link = "https://countess-project.github.io/CountESS/included-plugins/#regex-tool"
@@ -33,37 +35,34 @@ class RegexToolPlugin(DuckdbThreadedTransformPlugin):
3335
drop_column = BooleanParam("Drop Column", False)
3436
drop_unmatch = BooleanParam("Drop Unmatched Rows", False)
3537

36-
compiled_re = None
37-
38-
def prepare(self, *a) -> None:
39-
super().prepare(*a)
40-
self.compiled_re = re.compile(self.regex.value)
41-
42-
def add_fields(self):
43-
return {op.name.value: op.datatype.get_selected_type() for op in self.output}
38+
def execute(
39+
self, ddbc: DuckDBPyConnection, source: DuckDBPyRelation, row_limit: Optional[int] = None
40+
) -> Optional[DuckDBPyRelation]:
41+
column_id = duckdb_escape_identifier(self.column.value)
42+
regexp_value = duckdb_escape_literal(self.regex.value)
43+
output_ids = [duckdb_escape_literal(o.name.value) for o in self.output if o.name.value]
44+
output_types = [
45+
duckdb_escape_identifier(o.name.value) + " " + o.datatype.value for o in self.output if o.name.value
46+
]
4447

45-
def remove_fields(self, field_names):
4648
if self.drop_column:
47-
return [self.column.value]
49+
proj = "".join(duckdb_escape_identifier(c) + ", " for c in source.columns if c != self.column.value)
4850
else:
49-
return []
51+
proj = "*, "
5052

51-
def transform(self, data: dict[str, Any]) -> Optional[dict[str, Any]]:
52-
assert self.compiled_re is not None
53-
value = data[self.column.value]
54-
if value is not None:
55-
try:
56-
if match := self.compiled_re.match(str(value)):
57-
data.update(
58-
{op.name.value: op.datatype.cast_value(val) for op, val in zip(self.output, match.groups())}
59-
)
60-
return data
61-
else:
62-
logger.info("%s didn't match", repr(value))
63-
except (TypeError, ValueError) as exc:
64-
logger.warning("Exception", exc_info=exc)
53+
proj += f"""
54+
unnest(try_cast(
55+
regexp_extract({column_id}, {regexp_value}, [{','.join(output_ids)}])
56+
as struct({','.join(output_types)})
57+
))
58+
"""
59+
60+
logger.debug("VampseqScorePlugin proj %s", proj)
6561

6662
if self.drop_unmatch:
67-
return None
63+
filt = f"regexp_matches({column_id}, {regexp_value})"
64+
logger.debug("VampseqScorePlugin filt %s", filt)
65+
return source.filter(filt).project(proj)
66+
6867
else:
69-
return data
68+
return source.project(proj)

tests/plugins/test_regex.py

Lines changed: 59 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,71 @@
1+
import duckdb
12
import pandas as pd
23

34
from countess.plugins.regex import RegexToolPlugin
45

6+
ddbc = duckdb.connect()
57

6-
def test_tool():
8+
ddbc.from_df(
9+
pd.DataFrame(
10+
[
11+
{"stuff": "hello"},
12+
{"stuff": "backwards"},
13+
{"stuff": "noaardvark"},
14+
]
15+
)
16+
).create("n_0")
17+
18+
source = ddbc.table("n_0")
19+
20+
21+
def test_tool_1():
722
plugin = RegexToolPlugin()
823
plugin.set_parameter("regex", ".*?([a]+).*")
9-
plugin.prepare("fake", None)
24+
plugin.set_parameter("output.0.name", "foo")
25+
plugin.prepare(ddbc, source)
1026
plugin.set_parameter("column", "stuff")
27+
28+
out = plugin.execute(ddbc, source)
29+
30+
assert len(out) == 3
31+
assert out.columns == ["stuff", "foo"]
32+
assert sorted(out.fetchall()) == [("backwards", "a"), ("hello", ""), ("noaardvark", "aa")]
33+
34+
35+
def test_tool_2():
36+
plugin = RegexToolPlugin()
37+
plugin.set_parameter("regex", ".*?([a]+).*")
1138
plugin.set_parameter("output.0.name", "foo")
39+
plugin.set_parameter("drop_column", True)
40+
plugin.prepare(ddbc, source)
41+
plugin.set_parameter("column", "stuff")
42+
43+
out = plugin.execute(ddbc, source)
44+
45+
assert len(out) == 3
46+
assert out.columns == ["foo"]
47+
assert sorted(out.fetchall()) == [("",), ("a",), ("aa",)]
1248

13-
assert plugin.transform({"stuff": "hello"}) == {"stuff": "hello"}
14-
assert plugin.transform({"stuff": "backwards"})["foo"] == "a"
15-
assert plugin.transform({"stuff": "noaardvark"})["foo"] == "aa"
1649

50+
def test_tool_3():
51+
plugin = RegexToolPlugin()
52+
plugin.set_parameter("regex", ".*?([a]+).*")
53+
plugin.set_parameter("output.0.name", "foo")
1754
plugin.set_parameter("drop_unmatch", True)
18-
assert plugin.transform({"stuff": "hello"}) is None
55+
plugin.prepare(ddbc, source)
56+
plugin.set_parameter("column", "stuff")
57+
58+
out = plugin.execute(ddbc, source)
59+
60+
assert len(out) == 2
61+
assert out.columns == ["stuff", "foo"]
62+
assert sorted(out.fetchall()) == [
63+
(
64+
"backwards",
65+
"a",
66+
),
67+
(
68+
"noaardvark",
69+
"aa",
70+
),
71+
]

tests/test_cmd.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
from countess.core.cmd import configure_graphs, main, run
99

1010
expected_output = """"thing","result"
11-
"foo",23512.8
11+
"foo",23512.799
1212
"baz",12769
13-
"qux",14728.114285714286
13+
"qux",14728.114
1414
"""
1515

1616

0 commit comments

Comments
 (0)