fix regex plugin to run inside duckdb

nickzoic · nickzoic · commit c5b93e64cdfa · 2025-03-12T14:03:24.000+11:00
diff --git a/countess/plugins/regex.py b/countess/plugins/regex.py
@@ -1,6 +1,7 @@
 import logging
-import re
-from typing import Any, Optional
+from typing import Optional
+
+from duckdb import DuckDBPyConnection, DuckDBPyRelation
 
 from countess import VERSION
 from countess.core.parameters import (
@@ -11,7 +12,8 @@
     MultiParam,
     StringParam,
 )
-from countess.core.plugins import DuckdbThreadedTransformPlugin
+from countess.core.plugins import DuckdbSimplePlugin
+from countess.utils.duckdb import duckdb_escape_identifier, duckdb_escape_literal
 
 logger = logging.getLogger(__name__)
 
@@ -21,7 +23,7 @@ class OutputColumnsMultiParam(MultiParam):
     datatype = DataTypeChoiceParam("Column Type", "STRING")
 
 
-class RegexToolPlugin(DuckdbThreadedTransformPlugin):
+class RegexToolPlugin(DuckdbSimplePlugin):
     name = "Regex Tool"
     description = "Apply regular expressions to a column to make new column(s)"
     link = "https://countess-project.github.io/CountESS/included-plugins/#regex-tool"
@@ -33,37 +35,53 @@ class RegexToolPlugin(DuckdbThreadedTransformPlugin):
     drop_column = BooleanParam("Drop Column", False)
     drop_unmatch = BooleanParam("Drop Unmatched Rows", False)
 
-    compiled_re = None
-
-    def prepare(self, *a) -> None:
-        super().prepare(*a)
-        self.compiled_re = re.compile(self.regex.value)
-
-    def add_fields(self):
-        return {op.name.value: op.datatype.get_selected_type() for op in self.output}
+    def execute(
+        self, ddbc: DuckDBPyConnection, source: DuckDBPyRelation, row_limit: Optional[int] = None
+    ) -> Optional[DuckDBPyRelation]:
+        """Extract regex capture groups from one column into new typed columns.
+
+        Builds a DuckDB projection that applies `regexp_extract` with a list of
+        output names to the selected column, `try_cast`s the resulting struct
+        to the configured column types and `unnest`s it into separate columns.
+
+        Args:
+            ddbc: DuckDB connection (not used by this method).
+            source: relation holding the column to match against.
+            row_limit: accepted for interface compatibility; not used here.
+
+        Returns:
+            The projected relation. When "Drop Unmatched Rows" is set, rows
+            that fail `regexp_matches` are filtered out first; when
+            "Drop Column" is set, the input column is left out of the result.
+        """
+        column_id = duckdb_escape_identifier(self.column.value)
+        regexp_value = duckdb_escape_literal(self.regex.value)
+        # Output names are needed twice: as string literals for the name list
+        # passed to regexp_extract, and as identifiers for the struct fields.
+        output_ids = [duckdb_escape_literal(o.name.value) for o in self.output if o.name.value]
+        output_types = [
+            duckdb_escape_identifier(o.name.value) + " " + o.datatype.value for o in self.output if o.name.value
+        ]
 
-    def remove_fields(self, field_names):
         if self.drop_column:
-            return [self.column.value]
+            proj = "".join(duckdb_escape_identifier(c) + ", " for c in source.columns if c != self.column.value)
         else:
-            return []
+            proj = "*, "
 
-    def transform(self, data: dict[str, Any]) -> Optional[dict[str, Any]]:
-        assert self.compiled_re is not None
-        value = data[self.column.value]
-        if value is not None:
-            try:
-                if match := self.compiled_re.match(str(value)):
-                    data.update(
-                        {op.name.value: op.datatype.cast_value(val) for op, val in zip(self.output, match.groups())}
-                    )
-                    return data
-                else:
-                    logger.info("%s didn't match", repr(value))
-            except (TypeError, ValueError) as exc:
-                logger.warning("Exception", exc_info=exc)
+        # proj already ends with ", ", so the unnest() expression is simply appended.
+        proj += f"""
+            unnest(try_cast(
+                regexp_extract({column_id}, {regexp_value}, [{','.join(output_ids)}])
+                as struct({','.join(output_types)})
+            ))
+        """
+
+        logger.debug("RegexToolPlugin proj %s", proj)
 
         if self.drop_unmatch:
-            return None
+            filt = f"regexp_matches({column_id}, {regexp_value})"
+            logger.debug("RegexToolPlugin filt %s", filt)
+            return source.filter(filt).project(proj)
+
         else:
-            return data
+            return source.project(proj)
diff --git a/tests/plugins/test_regex.py b/tests/plugins/test_regex.py
@@ -1,18 +1,71 @@
+import duckdb
 import pandas as pd
 
 from countess.plugins.regex import RegexToolPlugin
 
+ddbc = duckdb.connect()  # connection shared by the tests in this module
 
-def test_tool():
+ddbc.from_df(
+    pd.DataFrame(
+        [
+            {"stuff": "hello"},  # contains no "a"
+            {"stuff": "backwards"},  # first run of "a" is "a"
+            {"stuff": "noaardvark"},  # first run of "a" is "aa"
+        ]
+    )
+).create("n_0")  # store the frame as table "n_0"
+
+source = ddbc.table("n_0")  # relation passed to the plugin by each test
+
+
+def test_tool_1():  # defaults: input column kept, unmatched rows kept
     plugin = RegexToolPlugin()
     plugin.set_parameter("regex", ".*?([a]+).*")
-    plugin.prepare("fake", None)
+    plugin.set_parameter("output.0.name", "foo")  # the single capture group becomes column "foo"
+    plugin.prepare(ddbc, source)  # NOTE(review): runs before "column" is set below; confirm intended
     plugin.set_parameter("column", "stuff")
+
+    out = plugin.execute(ddbc, source)
+
+    assert len(out) == 3  # unmatched "hello" is kept, with foo == ""
+    assert out.columns == ["stuff", "foo"]  # new column follows the original one
+    assert sorted(out.fetchall()) == [("backwards", "a"), ("hello", ""), ("noaardvark", "aa")]
+
+
+def test_tool_2():  # "drop_column" set: the input column is removed from the output
+    plugin = RegexToolPlugin()
+    plugin.set_parameter("regex", ".*?([a]+).*")
     plugin.set_parameter("output.0.name", "foo")
+    plugin.set_parameter("drop_column", True)
+    plugin.prepare(ddbc, source)  # NOTE(review): runs before "column" is set below; confirm intended
+    plugin.set_parameter("column", "stuff")
+
+    out = plugin.execute(ddbc, source)
+
+    assert len(out) == 3  # still one output row per input row
+    assert out.columns == ["foo"]  # "stuff" is gone
+    assert sorted(out.fetchall()) == [("",), ("a",), ("aa",)]
 
-    assert plugin.transform({"stuff": "hello"}) == {"stuff": "hello"}
-    assert plugin.transform({"stuff": "backwards"})["foo"] == "a"
-    assert plugin.transform({"stuff": "noaardvark"})["foo"] == "aa"
 
+def test_tool_3():  # "drop_unmatch" set: rows the regex does not match are removed
+    plugin = RegexToolPlugin()
+    plugin.set_parameter("regex", ".*?([a]+).*")
+    plugin.set_parameter("output.0.name", "foo")
     plugin.set_parameter("drop_unmatch", True)
-    assert plugin.transform({"stuff": "hello"}) is None
+    plugin.prepare(ddbc, source)  # NOTE(review): runs before "column" is set below; confirm intended
+    plugin.set_parameter("column", "stuff")
+
+    out = plugin.execute(ddbc, source)
+
+    assert len(out) == 2  # "hello" contains no "a" and is filtered out
+    assert out.columns == ["stuff", "foo"]  # input column is still kept
+    assert sorted(out.fetchall()) == [
+        (
+            "backwards",
+            "a",
+        ),
+        (
+            "noaardvark",
+            "aa",
+        ),
+    ]
diff --git a/tests/test_cmd.py b/tests/test_cmd.py
@@ -8,9 +8,9 @@
 from countess.core.cmd import configure_graphs, main, run
 
 expected_output = """"thing","result"
-"foo",23512.8
+"foo",23512.799
 "baz",12769
-"qux",14728.114285714286
+"qux",14728.114
 """