[pre-commit.ci] pre-commit suggestions #542

Merged
merged 5 commits on Apr 8, 2025
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
@@ -40,7 +40,7 @@ repos:
- id: detect-private-key

- repo: https://github.com/codespell-project/codespell
- rev: v2.3.0
+ rev: v2.4.1
hooks:
- id: codespell
additional_dependencies: [tomli]
@@ -51,14 +51,14 @@ repos:
#args: ["--write-changes"] # uncomment if you want to get automatic fixing

- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.8.6
+ rev: v0.11.4
hooks:
- id: ruff
args: ["--fix"]
- id: ruff-format

- repo: https://github.com/executablebooks/mdformat
- rev: 0.7.21
+ rev: 0.7.22
hooks:
- id: mdformat
additional_dependencies:
@@ -77,11 +77,11 @@ repos:
args: ["--print-width=120"]

- repo: https://github.com/tox-dev/pyproject-fmt
- rev: v2.5.0
+ rev: v2.5.1
hooks:
- id: pyproject-fmt
additional_dependencies: [tox]
- repo: https://github.com/abravalheri/validate-pyproject
- rev: v0.23
+ rev: v0.24.1
hooks:
- id: validate-pyproject
2 changes: 1 addition & 1 deletion src/litdata/processing/data_processor.py
@@ -1075,7 +1075,7 @@ def __init__(
self.item_loader = item_loader
self.storage_options = storage_options

- self.state_dict = state_dict or {rank: 0 for rank in range(self.num_workers)}
+ self.state_dict = state_dict or dict.fromkeys(range(self.num_workers), 0)

if self.reader is not None and self.weights is not None:
raise ValueError("Either the reader or the weights needs to be defined.")
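For context, a minimal standalone sketch (not part of this diff) of why the `dict.fromkeys` rewrite is a drop-in replacement for the comprehension: both build the same zero-initialized mapping, and sharing one value object across keys is harmless here because the default is an immutable int.

    # Illustration only: both expressions build {0: 0, 1: 0, ..., n-1: 0}.
    num_workers = 4
    via_comprehension = {rank: 0 for rank in range(num_workers)}
    via_fromkeys = dict.fromkeys(range(num_workers), 0)
    assert via_comprehension == via_fromkeys
    # dict.fromkeys reuses the same value object for every key, so it is only
    # a safe swap when the default is immutable, as the 0 here is.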
2 changes: 1 addition & 1 deletion src/litdata/processing/functions.py
@@ -473,7 +473,7 @@ def optimize(
num_workers = 1

num_workers = num_workers or _get_default_num_workers()
- state_dict = {rank: 0 for rank in range(num_workers)}
+ state_dict = dict.fromkeys(range(num_workers), 0)

existing_index_file_content = (
read_index_file_content(_output_dir, storage_options) if mode == "append" else None
2 changes: 1 addition & 1 deletion src/litdata/streaming/fs_provider.py
@@ -264,5 +264,5 @@ def _get_fs_provider(remote_filepath: str, storage_options: Optional[Dict[str, A

def not_supported_provider(remote_filepath: str) -> bool:
raise ValueError(
f"URL should start with one of {[el + '://' for el in _SUPPORTED_PROVIDERS]}." f"Found {remote_filepath}."
f"URL should start with one of {[el + '://' for el in _SUPPORTED_PROVIDERS]}.Found {remote_filepath}."
)
2 changes: 1 addition & 1 deletion src/litdata/utilities/format.py
@@ -32,7 +32,7 @@ def _convert_bytes_to_int(bytes_str: str) -> int:
except ValueError:
raise ValueError(
f"Unsupported value/suffix {bytes_str}. Supported suffix are "
f'{["b"] + list(_FORMAT_TO_RATIO.keys())}.'
f"{['b'] + list(_FORMAT_TO_RATIO.keys())}."
)
raise ValueError(f"The supported units are {_FORMAT_TO_RATIO.keys()}")

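As a side note, here is a small self-contained sketch of the kind of suffix parsing this error message lives in; the suffix table, ratios, and helper name below are illustrative assumptions, not litdata's exact implementation.

    # Illustrative only: parse a human-readable size string such as "10GB" into bytes.
    # The real _convert_bytes_to_int in litdata may use different suffixes/ratios.
    _FORMAT_TO_RATIO = {"kb": 1000, "mb": 1000**2, "gb": 1000**3, "tb": 1000**4}

    def convert_bytes_to_int(bytes_str: str) -> int:
        s = bytes_str.strip().lower()
        for suffix, ratio in _FORMAT_TO_RATIO.items():
            if s.endswith(suffix):
                try:
                    return int(float(s[: -len(suffix)]) * ratio)
                except ValueError:
                    raise ValueError(
                        f"Unsupported value/suffix {bytes_str}. Supported suffix are "
                        f"{['b'] + list(_FORMAT_TO_RATIO.keys())}."
                    )
        raise ValueError(f"The supported units are {list(_FORMAT_TO_RATIO.keys())}")

    assert convert_bytes_to_int("10GB") == 10_000_000_000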
2 changes: 1 addition & 1 deletion src/litdata/utilities/packing.py
@@ -40,7 +40,7 @@ def _pack_greedily(items: List[Any], weights: List[int], num_bins: int) -> Tuple

sorted_items_and_weights = sorted(zip(items, weights), key=lambda x: x[1], reverse=True)
bin_contents = defaultdict(list)
- bin_weights = {i: 0 for i in range(num_bins)}
+ bin_weights = dict.fromkeys(range(num_bins), 0)

for item, weight in sorted_items_and_weights:
min_bin_id = min(bin_weights, key=(lambda x: bin_weights[x]), default=0)
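For readers skimming the hunk above, a self-contained sketch of the greedy packing loop it touches; the function name and return shape are assumptions for illustration, not litdata's exact API.

    # Sketch of greedy bin packing: heaviest items first, each into the currently lightest bin.
    from collections import defaultdict
    from typing import Any, Dict, List, Tuple

    def pack_greedily(items: List[Any], weights: List[int], num_bins: int) -> Tuple[Dict[int, List[Any]], Dict[int, int]]:
        sorted_pairs = sorted(zip(items, weights), key=lambda x: x[1], reverse=True)
        bin_contents: Dict[int, List[Any]] = defaultdict(list)
        bin_weights = dict.fromkeys(range(num_bins), 0)  # same mapping as {i: 0 for i in range(num_bins)}
        for item, weight in sorted_pairs:
            min_bin_id = min(bin_weights, key=bin_weights.get, default=0)
            bin_contents[min_bin_id].append(item)
            bin_weights[min_bin_id] += weight
        return bin_contents, bin_weights

    # Example: 4 items split across 2 bins end up balanced at weight 8 each.
    contents, weights = pack_greedily(["a", "b", "c", "d"], [7, 5, 3, 1], num_bins=2)
    assert sorted(weights.values()) == [8, 8]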
4 changes: 2 additions & 2 deletions tests/processing/test_functions.py
@@ -511,11 +511,11 @@ def test_optimize_race_condition(tmpdir):
]

for i, url in enumerate(urls):
print(f"downloading {i+1} file")
print(f"downloading {i + 1} file")
with requests.get(url, stream=True, timeout=10) as r:
r.raise_for_status() # Raise an exception for bad status codes

with open(f"{tmpdir}/custom_texts/book{i+1}.txt", "wb") as f:
with open(f"{tmpdir}/custom_texts/book{i + 1}.txt", "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
f.write(chunk)

2 changes: 1 addition & 1 deletion tests/streaming/test_cache.py
@@ -228,7 +228,7 @@ def __len__(self) -> int:
os.makedirs(os.path.join(tmpdir, "cache_2"), exist_ok=True)
dataset = RandomDatasetAtRuntime(64, 64)
dataloader = CacheDataLoader(dataset, cache_dir=os.path.join(tmpdir, "cache_2"), chunk_bytes=2 << 12)
- with pytest.raises(ValueError, match="Your dataset items aren't deterministic"): # noqa: PT012
+ with pytest.raises(ValueError, match="Your dataset items aren't deterministic"):
for _ in dataloader:
pass

12 changes: 6 additions & 6 deletions tests/streaming/test_combined.py
@@ -552,9 +552,9 @@ def test_combined_dataset_dataloader_states_partial_iterations(combined_dataset,
if batch_idx == break_at:
break

- assert (
-     not dataloader.restore
- ), "Dataloader should not be in restore state after partial iteration, before loading state."
+ assert not dataloader.restore, (
+     "Dataloader should not be in restore state after partial iteration, before loading state."
+ )
dataloader.load_state_dict(dataloader.state_dict())
assert dataloader.restore, "Dataloader should be in restore state after loading the state from a partial iteration."

Expand All @@ -564,9 +564,9 @@ def test_combined_dataset_dataloader_states_partial_iterations(combined_dataset,
assert dataloader.current_epoch == 1, "Current epoch should be 1 during restore"
count += 1
expected_batches = total_batches - break_at - 1
- assert (
-     count >= expected_batches
- ), f"There should be at least{expected_batches} remaining batches in the first epoch."
+ assert count >= expected_batches, (
+     f"There should be at least{expected_batches} remaining batches in the first epoch."
+ )
assert not dataloader.restore, "Dataloader should not be in restore state after completing first epoch."

# Verify batches in the second epoch
12 changes: 6 additions & 6 deletions tests/streaming/test_dataset.py
@@ -150,9 +150,9 @@ def test_streaming_dataset_max_cache_dir(tmpdir, caplog):
StreamingDataset(input_dir=str(tmpdir), max_cache_size="10GB")
StreamingDataset(input_dir=str(tmpdir), max_cache_size="20GB")
assert len(caplog.messages) == 4
- assert all(
-     "The provided `max_cache_size` is less than 25GB." in record.message for record in caplog.records
- ), "Expected warning about the `max_cache_size` being less than 25GB was not logged"
+ assert all("The provided `max_cache_size` is less than 25GB." in record.message for record in caplog.records), (
+     "Expected warning about the `max_cache_size` being less than 25GB was not logged"
+ )


@pytest.mark.parametrize("drop_last", [False, True])
@@ -1540,9 +1540,9 @@ def test_dataset_as_iterator_and_non_iterator(tmpdir, local, shuffle):
assert data is not None
if local and i < dataset_length - 1:
# In iterator mode with local or remote data, _chunks_queued_for_download should be enabled
- assert (
-     dataset.cache._reader._chunks_queued_for_download is True
- ), "_chunks_queued_for_download should be enabled during iteration"
+ assert dataset.cache._reader._chunks_queued_for_download is True, (
+     "_chunks_queued_for_download should be enabled during iteration"
+ )
else:
assert dataset.cache._reader._chunks_queued_for_download is False, (
"_chunks_queued_for_download should be disabled when used as local dir without `local:` prefix"