ci: update pre-commit hooks, fix linting, and refresh dependencies (#1385)

dariocurr · web-flow · commit e42775c2fcfe · 2026-02-26T09:13:38.000-05:00
* ci: update pre-commit hooks and fix linting issues

* Update Ruff version in pre-commit configuration to v0.15.1.
* Add noqa comments to suppress specific linting warnings in various files.
* Mark the regex patterns passed to `pytest.raises(match=...)` in test cases as raw strings (`r"..."`); the pattern text itself is unchanged.

* style: correct indentation in GitHub Actions workflow file

* Reduced the indentation of the enable-cache option under `with:` in the test.yml workflow file by two spaces so that it is consistent with the rest of the file's YAML formatting.

* refactor: reorder imports in indexed_field.rs for clarity

* Adjusted the order of imports in indexed_field.rs to improve readability and maintain consistency with project conventions.

* build: update dependencies in Cargo.toml and Cargo.lock

* Bump versions of several dependencies including tokio, pyo3-log, prost, uuid, and log to their latest releases.
* Update Cargo.lock to reflect the changes in dependency versions.

* style: format pyproject.toml for consistency

* Adjusted formatting in pyproject.toml for improved readability by aligning lists and ensuring consistent indentation.
* Updated dependencies and configuration settings for better organization.

* style: remove noqa comments for import statements

* Cleaned up import statements in multiple files by removing unnecessary noqa comments, enhancing code readability and maintaining consistency across the codebase.

* style: simplify formatting in pyproject.toml

* Streamlined list formatting in pyproject.toml for improved readability by removing unnecessary line breaks and ensuring consistent structure across sections.
* No functional changes were made; the focus was solely on code style and organization.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -67,7 +67,7 @@ jobs:
       - name: Install dependencies
         uses: astral-sh/setup-uv@v7
         with:
-            enable-cache: true
+          enable-cache: true
 
       # Download the Linux wheel built in the build workflow
       - name: Download pre-built Linux wheel
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -22,7 +22,7 @@ repos:
           - id: actionlint-docker
       - repo: https://github.com/astral-sh/ruff-pre-commit
         # Ruff version.
-        rev: v0.9.10
+        rev: v0.15.1
         hooks:
           # Run the linter.
           - id: ruff
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -42,7 +42,7 @@ protoc = ["datafusion-substrait/protoc"]
 substrait = ["dep:datafusion-substrait"]
 
 [dependencies]
-tokio = { version = "1.47", features = [
+tokio = { version = "1.49", features = [
   "macros",
   "rt",
   "rt-multi-thread",
@@ -54,16 +54,16 @@ pyo3 = { version = "0.26", features = [
   "abi3-py310",
 ] }
 pyo3-async-runtimes = { version = "0.26", features = ["tokio-runtime"] }
-pyo3-log = "0.13.2"
+pyo3-log = "0.13.3"
 arrow = { version = "57", features = ["pyarrow"] }
 arrow-select = { version = "57" }
 datafusion = { version = "52", features = ["avro", "unicode_expressions"] }
 datafusion-substrait = { version = "52", optional = true }
 datafusion-proto = { version = "52" }
 datafusion-ffi = { version = "52" }
-prost = "0.14.1" # keep in line with `datafusion-substrait`
+prost = "0.14.3" # keep in line with `datafusion-substrait`
 serde_json = "1"
-uuid = { version = "1.18", features = ["v4"] }
+uuid = { version = "1.21", features = ["v4"] }
 mimalloc = { version = "0.1", optional = true, default-features = false, features = [
   "local_dynamic_tls",
 ] }
@@ -77,11 +77,11 @@ object_store = { version = "0.12.4", features = [
   "http",
 ] }
 url = "2"
-log = "0.4.27"
+log = "0.4.29"
 parking_lot = "0.12"
 
 [build-dependencies]
-prost-types = "0.14.1"     # keep in line with `datafusion-substrait`
+prost-types = "0.14.3"     # keep in line with `datafusion-substrait`
 pyo3-build-config = "0.26"
 
 [lib]
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ description = "Build and run queries against data"
 readme = "README.md"
 license = { file = "LICENSE.txt" }
 requires-python = ">=3.10"
-keywords = ["datafusion", "dataframe", "rust", "query-engine"]
+keywords = ["dataframe", "datafusion", "query-engine", "rust"]
 classifiers = [
   "Development Status :: 2 - Pre-Alpha",
   "Intended Audience :: Developers",
@@ -62,7 +62,7 @@ profile = "black"
 python-source = "python"
 module-name = "datafusion._internal"
 include = [{ path = "Cargo.lock", format = "sdist" }]
-exclude = [".github/**", "ci/**", ".asf.yaml"]
+exclude = [".asf.yaml", ".github/**", "ci/**"]
 # Require Cargo.lock is up to date
 locked = true
 features = ["substrait"]
@@ -77,19 +77,19 @@ select = ["ALL"]
 ignore = [
   "A001",    # Allow using words like min as variable names
   "A002",    # Allow using words like filter as variable names
+  "A005",    # Allow module named io
   "ANN401",  # Allow Any for wrapper classes
   "COM812",  # Recommended to ignore these rules when using with ruff-format
-  "FIX002",  # Allow TODO lines - consider removing at some point
   "FBT001",  # Allow boolean positional args
   "FBT002",  # Allow boolean positional args
+  "FIX002",  # Allow TODO lines - consider removing at some point
   "ISC001",  # Recommended to ignore these rules when using with ruff-format
+  "N812",    # Allow importing functions as `F`
+  "PD901",   # Allow variable name df
+  "PLR0913", # Allow many arguments in function definition
   "SLF001",  # Allow accessing private members
   "TD002",   # Do not require author names in TODO statements
   "TD003",   # Allow TODO lines
-  "PLR0913", # Allow many arguments in function definition
-  "PD901",   # Allow variable name df
-  "N812",    # Allow importing functions as `F`
-  "A005",    # Allow module named io
 ]
 
 [tool.ruff.lint.pydocstyle]
@@ -99,7 +99,7 @@ convention = "google"
 max-doc-length = 88
 
 [tool.ruff.lint.flake8-boolean-trap]
-extend-allowed-calls = ["lit", "datafusion.lit"]
+extend-allowed-calls = ["datafusion.lit", "lit"]
 
 # Disable docstring checking for these directories
 [tool.ruff.lint.per-file-ignores]
@@ -108,68 +108,69 @@ extend-allowed-calls = ["lit", "datafusion.lit"]
   "ARG",
   "BLE001",
   "D",
-  "S101",
-  "SLF",
   "PD",
+  "PLC0415",
+  "PLR0913",
   "PLR2004",
+  "PT004",
   "PT011",
   "RUF015",
+  "S101",
   "S608",
-  "PLR0913",
-  "PT004",
+  "SLF",
 ]
 "examples/*" = [
-  "D",
-  "W505",
-  "E501",
-  "T201",
-  "S101",
-  "PLR2004",
   "ANN001",
   "ANN202",
-  "INP001",
+  "D",
   "DTZ007",
+  "E501",
+  "INP001",
+  "PLR2004",
   "RUF015",
+  "S101",
+  "T201",
+  "W505",
 ]
 "dev/*" = [
+  "ANN001",
+  "C",
   "D",
   "E",
-  "T",
-  "S",
+  "ERA001",
+  "EXE",
+  "N817",
   "PLR",
-  "C",
+  "S",
   "SIM",
+  "T",
   "UP",
-  "EXE",
-  "N817",
-  "ERA001",
-  "ANN001",
 ]
 "benchmarks/*" = [
+  "ANN001",
+  "BLE",
   "D",
+  "E",
+  "ERA001",
+  "EXE",
   "F",
-  "T",
-  "BLE",
   "FURB",
+  "INP001",
   "PLR",
-  "E",
-  "TD",
-  "TRY",
   "S",
   "SIM",
-  "EXE",
+  "T",
+  "TD",
+  "TRY",
   "UP",
-  "ERA001",
-  "ANN001",
-  "INP001",
 ]
 "docs/*" = ["D"]
-"docs/source/conf.py" = ["ERA001", "ANN001", "INP001"]
+"docs/source/conf.py" = ["ANN001", "ERA001", "INP001"]
 
 [tool.codespell]
-skip = ["./target", "uv.lock", "./python/tests/test_functions.py"]
+skip = ["./python/tests/test_functions.py", "./target", "uv.lock"]
 count = true
-ignore-words-list = ["ans", "IST"]
+ignore-words-list = ["IST", "ans"]
 
 [dependency-groups]
 dev = [
@@ -182,8 +183,8 @@ dev = [
   "pre-commit>=4.3.0",
   "pyarrow>=19.0.0",
   "pygithub==2.5.0",
-  "pytest>=7.4.4",
   "pytest-asyncio>=0.23.3",
+  "pytest>=7.4.4",
   "pyyaml>=6.0.3",
   "ruff>=0.9.1",
   "toml>=0.10.2",
@@ -196,6 +197,6 @@ docs = [
   "pickleshare>=0.7.5",
   "pydata-sphinx-theme==0.8.0",
   "setuptools>=75.3.0",
-  "sphinx>=7.1.2",
   "sphinx-autoapi>=3.4.0",
+  "sphinx>=7.1.2",
 ]
diff --git a/python/datafusion/expr.py b/python/datafusion/expr.py
@@ -20,6 +20,8 @@
 See :ref:`Expressions` in the online documentation for more details.
 """
 
+# ruff: noqa: PLC0415
+
 from __future__ import annotations
 
 from collections.abc import Iterable, Sequence
diff --git a/python/datafusion/user_defined.py b/python/datafusion/user_defined.py
@@ -583,11 +583,11 @@ def from_pycapsule(func: AggregateUDFExportable | _PyCapsule) -> AggregateUDF:
         AggregateUDF that is exported via the FFI bindings.
         """
         if _is_pycapsule(func):
-            aggregate = cast(AggregateUDF, object.__new__(AggregateUDF))
+            aggregate = cast("AggregateUDF", object.__new__(AggregateUDF))
             aggregate._udaf = df_internal.AggregateUDF.from_pycapsule(func)
             return aggregate
 
-        capsule = cast(AggregateUDFExportable, func)
+        capsule = cast("AggregateUDFExportable", func)
         name = str(capsule.__class__)
         return AggregateUDF(
             name=name,
diff --git a/python/tests/test_catalog.py b/python/tests/test_catalog.py
@@ -248,7 +248,7 @@ def test_exception_not_mangled(ctx: SessionContext):
 
     schema.register_table("test_table", create_dataset())
 
-    with pytest.raises(ValueError, match="^test_table is not an acceptable name$"):
+    with pytest.raises(ValueError, match=r"^test_table is not an acceptable name$"):
         ctx.sql(f"select * from {catalog_name}.{schema_name}.test_table")
 
 
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -2790,7 +2790,7 @@ def test_write_parquet_with_options_encoding(tmp_path, encoding, data_types, res
 def test_write_parquet_with_options_unsupported_encoding(df, tmp_path, encoding):
     """Test that unsupported Parquet encodings do not work."""
     # BaseException is used since this throws a Rust panic: https://github.com/PyO3/pyo3/issues/3519
-    with pytest.raises(BaseException, match="Encoding .*? is not supported"):
+    with pytest.raises(BaseException, match=r"Encoding .*? is not supported"):
         df.write_parquet_with_options(tmp_path, ParquetWriterOptions(encoding=encoding))
 
 
diff --git a/python/tests/test_functions.py b/python/tests/test_functions.py
@@ -303,31 +303,31 @@ def py_flatten(arr):
             lambda data: [np.concatenate([arr, arr]) for arr in data],
         ),
         (
-            lambda col: f.array_dims(col),
+            f.array_dims,
             lambda data: [[len(r)] for r in data],
         ),
         (
-            lambda col: f.array_distinct(col),
+            f.array_distinct,
             lambda data: [list(set(r)) for r in data],
         ),
         (
-            lambda col: f.list_distinct(col),
+            f.list_distinct,
             lambda data: [list(set(r)) for r in data],
         ),
         (
-            lambda col: f.list_dims(col),
+            f.list_dims,
             lambda data: [[len(r)] for r in data],
         ),
         (
             lambda col: f.array_element(col, literal(1)),
             lambda data: [r[0] for r in data],
         ),
         (
-            lambda col: f.array_empty(col),
+            f.array_empty,
             lambda data: [len(r) == 0 for r in data],
         ),
         (
-            lambda col: f.empty(col),
+            f.empty,
             lambda data: [len(r) == 0 for r in data],
         ),
         (
@@ -343,11 +343,11 @@ def py_flatten(arr):
             lambda data: [r[0] for r in data],
         ),
         (
-            lambda col: f.array_length(col),
+            f.array_length,
             lambda data: [len(r) for r in data],
         ),
         (
-            lambda col: f.list_length(col),
+            f.list_length,
             lambda data: [len(r) for r in data],
         ),
         (
@@ -391,11 +391,11 @@ def py_flatten(arr):
             lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data],
         ),
         (
-            lambda col: f.array_ndims(col),
+            f.array_ndims,
             lambda data: [np.array(r).ndim for r in data],
         ),
         (
-            lambda col: f.list_ndims(col),
+            f.list_ndims,
             lambda data: [np.array(r).ndim for r in data],
         ),
         (
@@ -415,11 +415,11 @@ def py_flatten(arr):
             lambda data: [np.insert(arr, 0, 99.0) for arr in data],
         ),
         (
-            lambda col: f.array_pop_back(col),
+            f.array_pop_back,
             lambda data: [arr[:-1] for arr in data],
         ),
         (
-            lambda col: f.array_pop_front(col),
+            f.array_pop_front,
             lambda data: [arr[1:] for arr in data],
         ),
         (
diff --git a/python/tests/test_sql.py b/python/tests/test_sql.py
@@ -31,7 +31,7 @@
 def test_no_table(ctx):
     with pytest.raises(
         ValueError,
-        match="^Error during planning: table 'datafusion.public.b' not found$",
+        match=r"^Error during planning: table 'datafusion.public.b' not found$",
     ):
         ctx.sql("SELECT a FROM b").collect()
 
diff --git a/python/tests/test_udf.py b/python/tests/test_udf.py
diff --git a/python/tests/test_udwf.py b/python/tests/test_udwf.py
diff --git a/src/expr/indexed_field.rs b/src/expr/indexed_field.rs

Original file line number	Diff line number	Diff line change
`@@ -303,31 +303,31 @@ def py_flatten(arr):`
`303`	`303`	`lambda data: [np.concatenate([arr, arr]) for arr in data],`
`304`	`304`	`),`
`305`	`305`	`(`
`306`		`- lambda col: f.array_dims(col),`
	`306`	`+ f.array_dims,`
`307`	`307`	`lambda data: [[len(r)] for r in data],`
`308`	`308`	`),`
`309`	`309`	`(`
`310`		`- lambda col: f.array_distinct(col),`
	`310`	`+ f.array_distinct,`
`311`	`311`	`lambda data: [list(set(r)) for r in data],`
`312`	`312`	`),`
`313`	`313`	`(`
`314`		`- lambda col: f.list_distinct(col),`
	`314`	`+ f.list_distinct,`
`315`	`315`	`lambda data: [list(set(r)) for r in data],`
`316`	`316`	`),`
`317`	`317`	`(`
`318`		`- lambda col: f.list_dims(col),`
	`318`	`+ f.list_dims,`
`319`	`319`	`lambda data: [[len(r)] for r in data],`
`320`	`320`	`),`
`321`	`321`	`(`
`322`	`322`	`lambda col: f.array_element(col, literal(1)),`
`323`	`323`	`lambda data: [r[0] for r in data],`
`324`	`324`	`),`
`325`	`325`	`(`
`326`		`- lambda col: f.array_empty(col),`
	`326`	`+ f.array_empty,`
`327`	`327`	`lambda data: [len(r) == 0 for r in data],`
`328`	`328`	`),`
`329`	`329`	`(`
`330`		`- lambda col: f.empty(col),`
	`330`	`+ f.empty,`
`331`	`331`	`lambda data: [len(r) == 0 for r in data],`
`332`	`332`	`),`
`333`	`333`	`(`
`@@ -343,11 +343,11 @@ def py_flatten(arr):`
`343`	`343`	`lambda data: [r[0] for r in data],`
`344`	`344`	`),`
`345`	`345`	`(`
`346`		`- lambda col: f.array_length(col),`
	`346`	`+ f.array_length,`
`347`	`347`	`lambda data: [len(r) for r in data],`
`348`	`348`	`),`
`349`	`349`	`(`
`350`		`- lambda col: f.list_length(col),`
	`350`	`+ f.list_length,`
`351`	`351`	`lambda data: [len(r) for r in data],`
`352`	`352`	`),`
`353`	`353`	`(`
`@@ -391,11 +391,11 @@ def py_flatten(arr):`
`391`	`391`	`lambda data: [[i + 1 for i, _v in enumerate(r) if _v == 1.0] for r in data],`
`392`	`392`	`),`
`393`	`393`	`(`
`394`		`- lambda col: f.array_ndims(col),`
	`394`	`+ f.array_ndims,`
`395`	`395`	`lambda data: [np.array(r).ndim for r in data],`
`396`	`396`	`),`
`397`	`397`	`(`
`398`		`- lambda col: f.list_ndims(col),`
	`398`	`+ f.list_ndims,`
`399`	`399`	`lambda data: [np.array(r).ndim for r in data],`
`400`	`400`	`),`
`401`	`401`	`(`
`@@ -415,11 +415,11 @@ def py_flatten(arr):`
`415`	`415`	`lambda data: [np.insert(arr, 0, 99.0) for arr in data],`
`416`	`416`	`),`
`417`	`417`	`(`
`418`		`- lambda col: f.array_pop_back(col),`
	`418`	`+ f.array_pop_back,`
`419`	`419`	`lambda data: [arr[:-1] for arr in data],`
`420`	`420`	`),`
`421`	`421`	`(`
`422`		`- lambda col: f.array_pop_front(col),`
	`422`	`+ f.array_pop_front,`
`423`	`423`	`lambda data: [arr[1:] for arr in data],`
`424`	`424`	`),`
`425`	`425`	`(`