Enable path ruff check

timsaucer · timsaucer · commit 63fbcf394d7c · 2025-10-30T07:23:37.000-04:00
diff --git a/benchmarks/db-benchmark/groupby-datafusion.py b/benchmarks/db-benchmark/groupby-datafusion.py
@@ -18,6 +18,7 @@
 import gc
 import os
 import timeit
+from pathlib import Path
 
 import datafusion as df
 import pyarrow as pa
@@ -34,7 +35,7 @@
 
 print("# groupby-datafusion.py", flush=True)
 
-exec(open("./_helpers/helpers.py").read())
+exec(Path("./_helpers/helpers.py").read_text())
 
 
 def ans_shape(batches) -> tuple[int, int]:
@@ -65,7 +66,7 @@ def execute(df) -> list:
 sql = True
 
 data_name = os.environ["SRC_DATANAME"]
-src_grp = os.path.join("data", data_name + ".csv")
+src_grp = Path("data") / f"{data_name}.csv"
 print("loading dataset %s" % src_grp, flush=True)
 
 schema = pa.schema(
diff --git a/benchmarks/db-benchmark/join-datafusion.py b/benchmarks/db-benchmark/join-datafusion.py
@@ -18,6 +18,7 @@
 import gc
 import os
 import timeit
+from pathlib import Path
 
 import datafusion as df
 from datafusion import col
@@ -26,7 +27,7 @@
 
 print("# join-datafusion.py", flush=True)
 
-exec(open("./_helpers/helpers.py").read())
+exec(Path("./_helpers/helpers.py").read_text())
 
 
 def ans_shape(batches) -> tuple[int, int]:
@@ -49,12 +50,12 @@ def ans_shape(batches) -> tuple[int, int]:
 on_disk = "FALSE"
 
 data_name = os.environ["SRC_DATANAME"]
-src_jn_x = os.path.join("data", data_name + ".csv")
+src_jn_x = Path("data") / f"{data_name}.csv"
 y_data_name = join_to_tbls(data_name)
 src_jn_y = [
-    os.path.join("data", y_data_name[0] + ".csv"),
-    os.path.join("data", y_data_name[1] + ".csv"),
-    os.path.join("data", y_data_name[2] + ".csv"),
+    Path("data") / f"{y_data_name[0]}.csv",
+    Path("data") / f"{y_data_name[1]}.csv",
+    Path("data") / f"{y_data_name[2]}.csv",
 ]
 if len(src_jn_y) != 3:
     error_msg = "Something went wrong in preparing files used for join"
diff --git a/benchmarks/tpch/tpch.py b/benchmarks/tpch/tpch.py
@@ -17,12 +17,13 @@
 
 import argparse
 import time
+from pathlib import Path
 
 from datafusion import SessionContext
 
 
 def bench(data_path, query_path) -> None:
-    with open("results.csv", "w") as results:
+    with Path("results.csv").open("w") as results:
         # register tables
         start = time.time()
         total_time_millis = 0
@@ -45,7 +46,7 @@ def bench(data_path, query_path) -> None:
         print("Configuration:\n", ctx)
 
         # register tables
-        with open("create_tables.sql") as f:
+        with Path("create_tables.sql").open() as f:
             sql = ""
             for line in f.readlines():
                 if line.startswith("--"):
@@ -65,7 +66,7 @@ def bench(data_path, query_path) -> None:
 
         # run queries
         for query in range(1, 23):
-            with open(f"{query_path}/q{query}.sql") as f:
+            with Path(f"{query_path}/q{query}.sql").open() as f:
                 text = f.read()
                 tmp = text.split(";")
                 queries = [s.strip() for s in tmp if len(s.strip()) > 0]
diff --git a/dev/create_license.py b/dev/create_license.py
@@ -20,6 +20,7 @@
 
 import json
 import subprocess
+from pathlib import Path
 
 subprocess.check_output(["cargo", "install", "cargo-license"])
 data = subprocess.check_output(
@@ -248,5 +249,5 @@
     result += "------------------\n\n"
     result += f"### {name} {version}\n* source: [{repository}]({repository})\n* license: {license}\n\n"
 
-with open("LICENSE.txt", "w") as f:
+with Path("LICENSE.txt").open("w") as f:
     f.write(result)
diff --git a/dev/release/check-rat-report.py b/dev/release/check-rat-report.py
@@ -21,6 +21,7 @@
 import re
 import sys
 import xml.etree.ElementTree as ET
+from pathlib import Path
 
 if len(sys.argv) != 3:
     sys.stderr.write("Usage: %s exclude_globs.lst rat_report.xml\n" % sys.argv[0])
@@ -29,7 +30,7 @@
 exclude_globs_filename = sys.argv[1]
 xml_filename = sys.argv[2]
 
-globs = [line.strip() for line in open(exclude_globs_filename)]
+globs = [line.strip() for line in Path(exclude_globs_filename).read_text().splitlines()]
 
 tree = ET.parse(xml_filename)
 root = tree.getroot()
diff --git a/examples/python-udf-comparisons.py b/examples/python-udf-comparisons.py
@@ -15,16 +15,16 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
 import time
+from pathlib import Path
 
 import pyarrow as pa
 import pyarrow.compute as pc
 from datafusion import SessionContext, col, lit, udf
 from datafusion import functions as F
 
-path = os.path.dirname(os.path.abspath(__file__))
-filepath = os.path.join(path, "./tpch/data/lineitem.parquet")
+path = Path(__file__).parent.resolve()
+filepath = path / "./tpch/data/lineitem.parquet"
 
 # This example serves to demonstrate alternate approaches to answering the
 # question "return all of the rows that have a specific combination of these
diff --git a/examples/tpch/convert_data_to_parquet.py b/examples/tpch/convert_data_to_parquet.py
@@ -22,7 +22,7 @@
 as will be generated by the script provided in this repository.
 """
 
-import os
+from pathlib import Path
 
 import datafusion
 import pyarrow as pa
@@ -116,7 +116,7 @@
     ("S_COMMENT", pa.string()),
 ]
 
-curr_dir = os.path.dirname(os.path.abspath(__file__))
+curr_dir = Path(__file__).resolve().parent
 for filename, curr_schema_val in all_schemas.items():
     # For convenience, go ahead and convert the schema column names to lowercase
     curr_schema = [(s[0].lower(), s[1]) for s in curr_schema_val]
@@ -132,10 +132,8 @@
 
     schema = pa.schema(curr_schema)
 
-    source_file = os.path.abspath(
-        os.path.join(curr_dir, f"../../benchmarks/tpch/data/{filename}.csv")
-    )
-    dest_file = os.path.abspath(os.path.join(curr_dir, f"./data/{filename}.parquet"))
+    source_file = (curr_dir / f"../../benchmarks/tpch/data/{filename}.csv").resolve()
+    dest_file = (curr_dir / f"./data/{filename}.parquet").resolve()
 
     df = ctx.read_csv(source_file, schema=schema, has_header=False, delimiter="|")
 
diff --git a/examples/tpch/util.py b/examples/tpch/util.py
@@ -19,18 +19,16 @@
 Common utilities for running TPC-H examples.
 """
 
-import os
+from pathlib import Path
 
 
-def get_data_path(filename: str) -> str:
-    path = os.path.dirname(os.path.abspath(__file__))
+def get_data_path(filename: str) -> Path:
+    path = Path(__file__).resolve().parent
 
-    return os.path.join(path, "data", filename)
+    return path / "data" / filename
 
 
-def get_answer_file(answer_file: str) -> str:
-    path = os.path.dirname(os.path.abspath(__file__))
+def get_answer_file(answer_file: str) -> Path:
+    path = Path(__file__).resolve().parent
 
-    return os.path.join(
-        path, "../../benchmarks/tpch/data/answers", f"{answer_file}.out"
-    )
+    return path / "../../benchmarks/tpch/data/answers" / f"{answer_file}.out"
diff --git a/pyproject.toml b/pyproject.toml
@@ -90,8 +90,6 @@ ignore = [
     "PD901",   # Allow variable name df
     "N812",    # Allow importing functions as `F`
     "A005",    # Allow module named io
-    # TODO: Enable all of the following, but this PR is getting too large already
-    "PTH",
 ]
 
 [tool.ruff.lint.pydocstyle]
diff --git a/python/datafusion/input/location.py b/python/datafusion/input/location.py
@@ -17,7 +17,6 @@
 
 """The default input source for DataFusion."""
 
-import glob
 from pathlib import Path
 from typing import Any
 
@@ -84,6 +83,9 @@ def build_table(
             raise RuntimeError(msg)
 
         # Input could possibly be multiple files. Create a list if so
-        input_files = glob.glob(input_item)
+        # Path.glob rejects absolute patterns, so glob relative to the anchor
+        input_path = Path(input_item)
+        pattern = str(input_path.relative_to(input_path.anchor))
+        input_files = [str(p) for p in Path(input_path.anchor).glob(pattern)]
 
         return SqlTable(table_name, columns, num_rows, input_files)
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
@@ -21,6 +21,7 @@
 import re
 import threading
 import time
+from pathlib import Path
 from typing import Any
 
 import pyarrow as pa
@@ -2413,11 +2414,11 @@ def test_write_parquet_with_options_bloom_filter(df, tmp_path):
 
     size_no_bloom_filter = 0
     for file in path_no_bloom_filter.rglob("*.parquet"):
-        size_no_bloom_filter += os.path.getsize(file)
+        size_no_bloom_filter += Path(file).stat().st_size
 
     size_bloom_filter = 0
     for file in path_bloom_filter.rglob("*.parquet"):
-        size_bloom_filter += os.path.getsize(file)
+        size_bloom_filter += Path(file).stat().st_size
 
     assert size_no_bloom_filter < size_bloom_filter
 

Original file line number	Diff line number	Diff line change
`@@ -90,8 +90,6 @@ ignore = [`
`90`	`90`	`"PD901", # Allow variable name df`
`91`	`91`	`` "N812", # Allow importing functions as `F` ``
`92`	`92`	`"A005", # Allow module named io`
`93`		`- # TODO: Enable all of the following, but this PR is getting too large already`
`94`		`- "PTH",`
`95`	`93`	`]`
`96`	`94`
`97`	`95`	`[tool.ruff.lint.pydocstyle]`