Add typos check to pre-commit hooks (#10040)

max-sixty · web-flow · commit 4d8bbeea82f1 · 2025-03-06T18:28:16.000-08:00
* Add typos check to pre-commit hooks

Also fixes a bunch of typos. The bulk of the work here was adding the exclusions (I had an LLM generate them, but I also checked them).
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -73,3 +73,9 @@ repos:
     hooks:
       - id: validate-pyproject
         additional_dependencies: ["validate-pyproject-schema-store[all]"]
+  - repo: https://github.com/crate-ci/typos
+    rev: dictgen-v0.3.1
+    hooks:
+      - id: typos
+        # https://github.com/crate-ci/typos/issues/347
+        pass_filenames: false
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -111,7 +111,7 @@ class can be passed through the ``decode_times`` keyword argument (see also
     coder = xr.coders.CFDatetimeCoder(time_unit="s")
     ds = xr.open_dataset(filename, decode_times=coder)
 
-Similar control of the resoution of decoded timedeltas can be achieved through
+Similar control of the resolution of decoded timedeltas can be achieved through
 passing a :py:class:`coders.CFTimedeltaCoder` instance to the
 ``decode_timedelta`` keyword argument:
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,5 @@
 [project]
-authors = [
-  { name = "xarray Developers", email = "xarray@googlegroups.com" },
-]
+authors = [{ name = "xarray Developers", email = "xarray@googlegroups.com" }]
 classifiers = [
   "Development Status :: 5 - Production/Stable",
   "License :: OSI Approved :: Apache Software License",
@@ -22,11 +20,7 @@ name = "xarray"
 readme = "README.md"
 requires-python = ">=3.10"
 
-dependencies = [
-  "numpy>=1.24",
-  "packaging>=23.2",
-  "pandas>=2.1",
-]
+dependencies = ["numpy>=1.24", "packaging>=23.2", "pandas>=2.1"]
 
 # We don't encode minimum requirements here (though if we can write a script to
 # generate the text from `min_deps_check.py`, that's welcome...). We do add
@@ -70,6 +64,7 @@ types = [
   "types-PyYAML",
   "types-Pygments",
   "types-colorama",
+  "types-decorator",
   "types-defusedxml",
   "types-docutils",
   "types-networkx",
@@ -93,10 +88,7 @@ dask = "xarray.namedarray.daskmanager:DaskManager"
 
 [build-system]
 build-backend = "setuptools.build_meta"
-requires = [
-  "setuptools>=42",
-  "setuptools-scm>=7",
-]
+requires = ["setuptools>=42", "setuptools-scm>=7"]
 
 [tool.setuptools]
 packages = ["xarray"]
@@ -120,10 +112,7 @@ exclude_lines = ["pragma: no cover", "if TYPE_CHECKING"]
 
 [tool.mypy]
 enable_error_code = ["ignore-without-code", "redundant-self", "redundant-expr"]
-exclude = [
-  'build',
-  'xarray/util/generate_.*\.py',
-]
+exclude = ['build', 'xarray/util/generate_.*\.py']
 files = "xarray"
 show_error_context = true
 warn_redundant_casts = true
@@ -254,10 +243,7 @@ module = ["xarray.namedarray.*", "xarray.tests.test_namedarray"]
 # reportMissingTypeStubs = false
 
 [tool.ruff]
-extend-exclude = [
-  "doc",
-  "_typed_ops.pyi",
-]
+extend-exclude = ["doc", "_typed_ops.pyi"]
 
 [tool.ruff.lint]
 extend-select = [
@@ -383,3 +369,64 @@ test = "pytest"
 ignore = [
   "PP308", # This option creates a large amount of log lines.
 ]
+
+[tool.typos]
+
+[tool.typos.default]
+extend-ignore-identifiers-re = [
+  # Variable names
+  "nd_.*",
+  ".*_nd",
+  "ba_.*",
+  ".*_ba",
+  "ser_.*",
+  ".*_ser",
+  # Function/class names
+  "NDArray.*",
+  ".*NDArray.*",
+]
+
+[tool.typos.default.extend-words]
+# NumPy function names
+arange = "arange"
+
+# Technical terms
+nd = "nd"
+nin = "nin"
+
+# Variable names
+ba = "ba"
+ser = "ser"
+fo = "fo"
+iy = "iy"
+vart = "vart"
+ede = "ede"
+
+# Organization/Institution names
+Stichting = "Stichting"
+Mathematisch = "Mathematisch"
+
+# People's names (NOTE(review): "slowy" and "Commun" do not look like personal names; confirm grouping)
+Soler = "Soler"
+Bruning = "Bruning"
+Tung = "Tung"
+Claus = "Claus"
+Celles = "Celles"
+slowy = "slowy"
+Commun = "Commun"
+
+# Tests
+Ome = "Ome"
+SUR = "SUR"
+Tio = "Tio"
+Ono = "Ono"
+abl = "abl"
+
+# Non-standard word forms and acronyms accepted as-is
+splitted = "splitted"
+childs = "childs"
+cutted = "cutted"
+LOCA = "LOCA"
+
+[tool.typos.type.jupyter]
+extend-ignore-re = ["\"id\": \".*\""]
diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py
@@ -979,7 +979,7 @@ def reindex(
     """
 
     # TODO: (benbovy - explicit indexes): uncomment?
-    # --> from reindex docstrings: "any mis-matched dimension is simply ignored"
+    # --> from reindex docstrings: "any mismatched dimension is simply ignored"
     # bad_keys = [k for k in indexers if k not in obj._indexes and k not in obj.dims]
     # if bad_keys:
     #     raise ValueError(
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -1973,8 +1973,8 @@ def reindex_like(
             names to pandas.Index objects, which provides coordinates upon
             which to index the variables in this dataset. The indexes on this
             other object need not be the same as the indexes on this
-            dataset. Any mis-matched index values will be filled in with
-            NaN, and any mis-matched dimension names will simply be ignored.
+            dataset. Any mismatched index values will be filled in with
+            NaN, and any mismatched dimension names will simply be ignored.
         method : {None, "nearest", "pad", "ffill", "backfill", "bfill"}, optional
             Method to use for filling index values from other not found on this
             data array:
@@ -2155,8 +2155,8 @@ def reindex(
         ----------
         indexers : dict, optional
             Dictionary with keys given by dimension names and values given by
-            arrays of coordinates tick labels. Any mis-matched coordinate
-            values will be filled in with NaN, and any mis-matched dimension
+            arrays of coordinates tick labels. Any mismatched coordinate
+            values will be filled in with NaN, and any mismatched dimension
             names will simply be ignored.
             One of indexers or indexers_kwargs must be provided.
         copy : bool, optional
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -3364,8 +3364,8 @@ def reindex_like(
             names to pandas.Index objects, which provides coordinates upon
             which to index the variables in this dataset. The indexes on this
             other object need not be the same as the indexes on this
-            dataset. Any mis-matched index values will be filled in with
-            NaN, and any mis-matched dimension names will simply be ignored.
+            dataset. Any mismatched index values will be filled in with
+            NaN, and any mismatched dimension names will simply be ignored.
         method : {None, "nearest", "pad", "ffill", "backfill", "bfill", None}, optional
             Method to use for filling index values from other not found in this
             dataset:
@@ -3430,8 +3430,8 @@ def reindex(
         ----------
         indexers : dict, optional
             Dictionary with keys given by dimension names and values given by
-            arrays of coordinates tick labels. Any mis-matched coordinate
-            values will be filled in with NaN, and any mis-matched dimension
+            arrays of coordinates tick labels. Any mismatched coordinate
+            values will be filled in with NaN, and any mismatched dimension
             names will simply be ignored.
             One of indexers or indexers_kwargs must be provided.
         method : {None, "nearest", "pad", "ffill", "backfill", "bfill", None}, optional
diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
@@ -1464,7 +1464,7 @@ def sel(
         if any(ds != dim_size0 for ds in dim_size):
             raise ValueError(
                 "CoordinateTransformIndex only supports advanced (point-wise) indexing "
-                "with xarray.DataArray or xarray.Variable objects of macthing dimensions."
+                "with xarray.DataArray or xarray.Variable objects of matching dimensions."
             )
 
         coord_labels = {
diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py
@@ -477,7 +477,7 @@ def unique_subset_of(
     )
 
 
-class CFTimeStategy(st.SearchStrategy):
+class CFTimeStrategy(st.SearchStrategy):
     def __init__(self, min_value, max_value):
         self.min_value = min_value
         self.max_value = max_value
@@ -506,5 +506,5 @@ def do_draw(self, data):
             daysinmonth = date_type(99999, 12, 1).daysinmonth
             min_value = date_type(-99999, 1, 1)
             max_value = date_type(99999, 12, daysinmonth, 23, 59, 59, 999999)
-            strategy = CFTimeStategy(min_value, max_value)
+            strategy = CFTimeStrategy(min_value, max_value)
             return strategy.do_draw(data)
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
@@ -2576,7 +2576,7 @@ def test_chunk_encoding_with_dask(self) -> None:
             with self.roundtrip(original) as actual:
                 assert_identical(original, actual)
 
-        # but itermediate unaligned chunks are bad
+        # but intermediate unaligned chunks are bad
         badenc = ds.chunk({"x": (3, 5, 3, 1)})
         badenc.var1.encoding["chunks"] = (3,)
         with pytest.raises(ValueError, match=r"would overlap multiple dask chunks"):
diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py
@@ -1826,7 +1826,7 @@ def test_encode_cf_timedelta_casting_overflow_error(use_dask, dtype) -> None:
 
 _DECODE_TIMEDELTA_TESTS = {
     "default": (True, None, np.dtype("timedelta64[ns]"), True),
-    "decode_timdelta=False": (True, False, np.dtype("int64"), False),
+    "decode_timedelta=False": (True, False, np.dtype("int64"), False),
     "inherit-time_unit-from-decode_times": (
         CFDatetimeCoder(time_unit="s"),
         None,
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -1243,7 +1243,7 @@ def test_chunk_by_frequency(self, freq: str, calendar: str, add_gap: bool) -> No
         assert rechunked.chunksizes["time"] == expected
         assert rechunked.chunksizes["x"] == (2,) * 5
 
-    def test_chunk_by_frequecy_errors(self):
+    def test_chunk_by_frequency_errors(self):
         ds = Dataset({"foo": ("x", [1, 2, 3])})
         with pytest.raises(ValueError, match="virtual variable"):
             ds.chunk(x=TimeResampler("YE"))
@@ -2204,7 +2204,7 @@ def test_reindex(self) -> None:
 
         # invalid dimension
         # TODO: (benbovy - explicit indexes): uncomment?
-        # --> from reindex docstrings: "any mis-matched dimension is simply ignored"
+        # --> from reindex docstrings: "any mismatched dimension is simply ignored"
         # with pytest.raises(ValueError, match=r"indexer keys.*not correspond.*"):
         #     data.reindex(invalid=0)
 
diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py
@@ -1602,7 +1602,7 @@ def test_filter_like(self) -> None:
         assert filtered_tree.equals(barren_tree)
         assert "flowers" not in filtered_tree.children
 
-        # test symetrical pruning results in isomorphic trees
+        # test symmetrical pruning results in isomorphic trees
         assert flower_tree.filter_like(fruit_tree).isomorphic(
             fruit_tree.filter_like(flower_tree)
         )
diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py
@@ -606,7 +606,7 @@ def test_rolling_construct_automatic_rechunk(self):
 
         # Construct dataset with chunk size of (400, 400, 1) or 1.22 MiB
         da = DataArray(
-            dims=["latitute", "longitude", "time"],
+            dims=["latitude", "longitude", "time"],
             data=dask.array.random.random((400, 400, 400), chunks=(-1, -1, 1)),
             name="foo",
         )
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
@@ -868,7 +868,7 @@ def test_getitem_error(self):
 
         v = Variable(["x", "y", "z"], np.arange(60).reshape(3, 4, 5))
         ind = Variable(["x"], [0, 1])
-        with pytest.raises(IndexError, match=r"Dimensions of indexers mis"):
+        with pytest.raises(IndexError, match=r"Dimensions of indexers mismatch"):
             v[:, ind]
 
     @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -1464,7 +1464,7 @@ def sel(`
`1464`	`1464`	`if any(ds != dim_size0 for ds in dim_size):`
`1465`	`1465`	`raise ValueError(`
`1466`	`1466`	`"CoordinateTransformIndex only supports advanced (point-wise) indexing "`
`1467`		`- "with xarray.DataArray or xarray.Variable objects of macthing dimensions."`
	`1467`	`+ "with xarray.DataArray or xarray.Variable objects of matching dimensions."`
`1468`	`1468`	`)`
`1469`	`1469`
`1470`	`1470`	`coord_labels = {`
Original file line number	Diff line number	Diff line change
`@@ -1602,7 +1602,7 @@ def test_filter_like(self) -> None:`
`1602`	`1602`	`assert filtered_tree.equals(barren_tree)`
`1603`	`1603`	`assert "flowers" not in filtered_tree.children`
`1604`	`1604`
`1605`		`- # test symetrical pruning results in isomorphic trees`
	`1605`	`+ # test symmetrical pruning results in isomorphic trees`
`1606`	`1606`	`assert flower_tree.filter_like(fruit_tree).isomorphic(`
`1607`	`1607`	`fruit_tree.filter_like(flower_tree)`
`1608`	`1608`	`)`
Original file line number	Diff line number	Diff line change
`@@ -606,7 +606,7 @@ def test_rolling_construct_automatic_rechunk(self):`
`606`	`606`
`607`	`607`	`# Construct dataset with chunk size of (400, 400, 1) or 1.22 MiB`
`608`	`608`	`da = DataArray(`
`609`		`- dims=["latitute", "longitude", "time"],`
	`609`	`+ dims=["latitude", "longitude", "time"],`
`610`	`610`	`data=dask.array.random.random((400, 400, 400), chunks=(-1, -1, 1)),`
`611`	`611`	`name="foo",`
`612`	`612`	`)`