 from io import BufferedWriter, BytesIO
 from typing import Any, List, Optional, Sequence, Tuple, Union

-import pandas as pd
+import duckdb
+from duckdb import DuckDBPyConnection, DuckDBPyRelation

 from countess import VERSION
 from countess.core.parameters import (
     MultiParam,
     StringParam,
 )
-from countess.core.plugins import PandasInputFilesPlugin, PandasOutputPlugin
+from countess.core.plugins import DuckdbLoadFilePlugin, DuckdbSaveFilePlugin
 from countess.utils.files import clean_filename
-from countess.utils.pandas import flatten_columns
+from countess.utils.duckdb import duckdb_escape_literal, duckdb_escape_identifier

 CSV_FILE_TYPES: Sequence[Tuple[str, Union[str, List[str]]]] = [
     ("CSV", [".csv", ".csv.gz"]),
 class ColumnsMultiParam(MultiParam):
     name = StringParam("Column Name", "")
     type = DataTypeOrNoneChoiceParam("Column Type")
-    index = BooleanParam("Index?", False)


-class LoadCsvPlugin(PandasInputFilesPlugin):
+CSV_DELIMITER_CHOICES = {
+    ',': ',',
+    ';': ';',
+    '|': '|',
+    'TAB': '\t',
+    'SPACE': ' ',
+    'NONE': None,
+}
+
+class LoadCsvPlugin(DuckdbLoadFilePlugin):
     """Load CSV files"""

     name = "CSV Load"
@@ -46,78 +55,42 @@ class LoadCsvPlugin(PandasInputFilesPlugin):
     version = VERSION
     file_types = CSV_FILE_TYPES

-    delimiter = ChoiceParam("Delimiter", ",", choices=[",", ";", "TAB", "|", "WHITESPACE"])
-    quoting = ChoiceParam("Quoting", "None", choices=["None", "Double-Quote", "Quote with Escape"])
-    comment = ChoiceParam("Comment", "None", choices=["None", "#", ";"])
+    delimiter = ChoiceParam("Delimiter", ",", choices=CSV_DELIMITER_CHOICES.keys())
     header = BooleanParam("CSV file has header row?", True)
     filename_column = StringParam("Filename Column", "")
     columns = ArrayParam("Columns", ColumnsMultiParam("Column"))

-    def read_file_to_dataframe(self, filename: str, file_param: BaseParam, row_limit=None):
-        options: dict[str, Any] = {
-            "header": 0 if self.header else None,
-        }
-        if row_limit is not None:
-            options["nrows"] = row_limit
-
-        index_col_numbers = []
-
-        if len(self.columns):
-            options["names"] = []
-            options["usecols"] = []
-            options["converters"] = {}
-
-            for n, pp in enumerate(self.columns):
-                options["names"].append(str(pp.name) or f"column_{n}")
-                if pp.type.is_not_none():
-                    if pp.index:
-                        index_col_numbers.append(len(options["usecols"]))
-                    options["usecols"].append(n)
-                    options["converters"][n] = pp["type"].cast_value
-
-        if self.delimiter == "TAB":
-            options["delimiter"] = "\t"
-        elif self.delimiter == "WHITESPACE":
-            options["delim_whitespace"] = True
+    def load_file(
+        self, cursor: DuckDBPyConnection, filename: str, file_param: BaseParam, file_number: int
+    ) -> duckdb.DuckDBPyRelation:
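+        # First pass with no columns configured: let DuckDB sniff the header
+        # names and types, then record them in self.columns for later runs.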
+        if self.header and len(self.columns) == 0:
+            table = cursor.read_csv(
+                filename,
+                header=True,
+                delimiter=CSV_DELIMITER_CHOICES[self.delimiter.value],
+            )
+            for column_name, column_dtype in zip(table.columns, table.dtypes):
+                column_param = self.columns.add_row()
+                column_param.name.value = column_name
+                column_param.type.value = str(column_dtype)
         else:
-            options["delimiter"] = str(self.delimiter)
-
-        if self.quoting == "None":
-            options["quoting"] = csv.QUOTE_NONE
-        elif self.quoting == "Double-Quote":
-            options["quotechar"] = '"'
-            options["doublequote"] = True
-        elif self.quoting == "Quote with Escape":
-            options["quotechar"] = '"'
-            options["doublequote"] = False
-            options["escapechar"] = "\\"
-
-        if self.comment.value != "None":
-            options["comment"] = str(self.comment)
-
-        # XXX pd.read_csv(index_col=) is half the speed of pd.read_csv().set_index()
-
-        df = pd.read_csv(filename, **options)
-
-        while len(df.columns) > len(self.columns):
-            self.columns.add_row()
-
-        if self.header:
-            for n, col in enumerate(df.columns):
-                if not self.columns[n].name:
-                    self.columns[n].name = str(col)
-                    self.columns[n].type = "string"
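+            # Columns already configured (or no header to sniff): re-read with
+            # an explicit schema, skipping the original header row if present.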
+            table = cursor.read_csv(
+                filename,
+                header=False,
+                skiprows=1 if self.header else 0,
+                delimiter=CSV_DELIMITER_CHOICES[self.delimiter.value],
+                columns={str(c.name): str(c.type) for c in self.columns} if self.columns else None,
+            )

         if self.filename_column:
-            df[str(self.filename_column)] = clean_filename(filename)
-
-        if index_col_numbers:
-            df = df.set_index([df.columns[n] for n in index_col_numbers])
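+            # Append the cleaned filename as a constant column, escaping both
+            # the literal value and the identifier before splicing them into SQL.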
+            escaped_filename = duckdb_escape_literal(clean_filename(filename))
+            escaped_column = duckdb_escape_identifier(self.filename_column.value)
+            table = table.project(f"*, {escaped_filename} AS {escaped_column}")

-        return df
+        return table


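# A minimal standalone sketch (editor's illustration, not part of this commit)
# of the DuckDB calls used in load_file() above. "test.csv" and the column
# schema are hypothetical; read_csv and project are standard DuckDB Python API.
import duckdb

con = duckdb.connect()

# Explicit-schema read, as in the else branch: header sniffing is disabled,
# the real header row is skipped, and column names/types are supplied.
rel = con.read_csv(
    "test.csv",
    header=False,
    skiprows=1,
    delimiter=",",
    columns={"sequence": "VARCHAR", "count": "BIGINT"},
)

# Appending a constant filename column, as in the filename_column branch.
# duckdb_escape_literal / duckdb_escape_identifier live in countess.utils.duckdb;
# plain SQL quote-doubling is assumed here as an equivalent.
literal = "'" + "input1.csv".replace("'", "''") + "'"
identifier = '"' + "filename".replace('"', '""') + '"'
rel = rel.project(f"*, {literal} AS {identifier}")
print(rel.limit(5))
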
-class SaveCsvPlugin(PandasOutputPlugin):
+class SaveCsvPlugin(DuckdbSaveFilePlugin):
     name = "CSV Save"
     description = "Save data as CSV or similar delimited text files"
     link = "https://countess-project.github.io/CountESS/included-plugins/#csv-writer"
@@ -135,61 +108,5 @@ class SaveCsvPlugin(PandasOutputPlugin):
     SEPARATORS = {",": ",", ";": ";", "SPACE": " ", "TAB": "\t"}
     QUOTING = {False: csv.QUOTE_MINIMAL, True: csv.QUOTE_NONNUMERIC}

-    def prepare(self, sources: list[str], row_limit: Optional[int] = None):
-        if row_limit is None:
-            logger.debug("SaveCsvPlugin.process %s prepare %s", self.name, self.filename)
-            filename = str(self.filename)
-            if filename.endswith(".gz"):
-                self.filehandle = gzip.open(filename, "wb")
-            elif filename.endswith(".bz2"):
-                self.filehandle = bz2.open(filename, "wb")
-            else:
-                self.filehandle = open(filename, "wb")
-        else:
-            logger.debug("SaveCsvPlugin.process %s prepare BytesIO", self.name)
-            self.filehandle = BytesIO()
-
-        self.csv_columns = None
-
-    def process(self, data: pd.DataFrame, source: str):
-        # reset indexes so we can treat all columns equally.
-        # if there's just a nameless index then we don't care about it, drop it.
-        drop_index = data.index.name is None and data.index.names[0] is None
-        dataframe = flatten_columns(data.reset_index(drop=drop_index))
-
-        # if this is our first dataframe to write then decide whether to
-        # include the header or not.
-        if self.csv_columns is None:
-            self.csv_columns = list(dataframe.columns)
-            emit_header = bool(self.header)
-        else:
-            # add in any columns we haven't seen yet in previous dataframes.
-            for c in dataframe.columns:
-                if c not in self.csv_columns:
-                    self.csv_columns.append(c)
-                    logger.warning("Added CSV Column %s with no header", repr(c))
-            # fill in blanks for any columns which are in previous dataframes but not
-            # in this one.
-            dataframe = dataframe.assign(**{c: None for c in self.csv_columns if c not in dataframe.columns})
-            emit_header = False
-
-        logger.debug(
-            "SaveCsvPlugin.process %s writing rows %d columns %d", self.name, len(dataframe), len(self.csv_columns)
-        )
-
-        dataframe.to_csv(
-            self.filehandle,
-            header=emit_header,
-            columns=self.csv_columns,
-            index=False,
-            sep=self.SEPARATORS[str(self.delimiter)],
-            quoting=self.QUOTING[bool(self.quoting)],
-        )  # type: ignore[call-overload]
-        return []
-
-    def finalize(self):
-        logger.debug("SaveCsvPlugin.process %s finalize", self.name)
-        if isinstance(self.filehandle, BytesIO):
-            yield self.filehandle.getvalue().decode("utf-8")
-        else:
-            self.filehandle.close()
+    def execute(self, ddbc: DuckDBPyConnection, source: Optional[DuckDBPyRelation]) -> Optional[DuckDBPyRelation]:
+        pass
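
# A minimal sketch (editor's illustration; the commit itself leaves execute()
# as a stub) of how the save side could use DuckDB's CSV writer. It assumes
# the filename, delimiter and header parameters SaveCsvPlugin already declares,
# plus DuckDBPyRelation.write_csv:

def execute_sketch(self, ddbc: DuckDBPyConnection, source: Optional[DuckDBPyRelation]) -> Optional[DuckDBPyRelation]:
    if source is not None:
        # DuckDB infers gzip/zstd compression from the file extension,
        # replacing the gzip/bz2 filehandle logic removed above.
        source.write_csv(
            str(self.filename),
            sep=self.SEPARATORS[str(self.delimiter)],
            header=bool(self.header),
        )
    return None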