Skip to content

Commit 1a6a8b8

Browse files
Merge pull request #77 from python-thread/perf/return-generator
Make chunking more memory efficient
2 parents 9217bbc + 214a6c5 commit 1a6a8b8

File tree

2 files changed

+19
-16
lines changed

2 files changed

+19
-16
lines changed

src/thread/utils/algorithm.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
|_ b.py
99
"""
1010

11-
from typing import List, Tuple
11+
from typing import Tuple, Generator
1212

1313

14-
def chunk_split(dataset_length: int, number_of_chunks: int) -> List[Tuple[int, int]]:
14+
def chunk_split(
15+
dataset_length: int, number_of_chunks: int
16+
) -> Generator[Tuple[int, int], None, None]:
1517
"""
1618
Splits a dataset into balanced chunks
1719
@@ -27,7 +29,7 @@ def chunk_split(dataset_length: int, number_of_chunks: int) -> List[Tuple[int, i
2729
2830
Returns
2931
-------
30-
:returns list[tuple[int, int]]: The chunked dataset slices
32+
:returns Generator[tuple[int, int], None, None]: The chunked dataset slices
3133
3234
Raises
3335
------
@@ -41,13 +43,10 @@ def chunk_split(dataset_length: int, number_of_chunks: int) -> List[Tuple[int, i
4143
overflow = dataset_length % number_of_chunks
4244

4345
i = 0
44-
split = []
4546
while i < dataset_length:
4647
chunk_length = chunk_count + int(overflow > 0)
4748
b = i + chunk_length
4849

49-
split.append((i, b))
50+
yield (i, b)
5051
overflow -= 1
5152
i = b
52-
53-
return split

tests/test_algorithm.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,23 @@
11
import random
2+
from typing import Generator
3+
24
from src.thread.utils import algorithm
35

46

7+
def test_type():
    # chunk_split must be lazy: calling it yields a generator, not a list.
    produced = algorithm.chunk_split(5, 1)
    assert isinstance(produced, Generator)
9+
10+
511
def test_chunking_1():
    # A single chunk covers the whole dataset as one (start, stop) slice.
    chunks = list(algorithm.chunk_split(5, 1))
    assert chunks == [(0, 5)]
713

814

915
def test_chunking_2():
    # 5 items over 2 chunks: the remainder item goes to the first chunk (3 + 2).
    chunks = list(algorithm.chunk_split(5, 2))
    assert chunks == [(0, 3), (3, 5)]
1117

1218

1319
def test_chunking_3():
14-
assert algorithm.chunk_split(100, 8) == [
20+
assert list(algorithm.chunk_split(100, 8)) == [
1521
(0, 13),
1622
(13, 26),
1723
(26, 39),
@@ -31,15 +37,13 @@ def test_chunking_dynamic():
3137
expected_chunk_high = dataset_length % thread_count
3238

3339
i = 0
34-
heap = []
40+
gen = algorithm.chunk_split(dataset_length, thread_count)
3541
while i < dataset_length:
3642
chunk_length = expected_chunk_length_low + int(expected_chunk_high > 0)
3743
b = i + chunk_length
3844

39-
heap.append((i, b))
45+
assert (
46+
next(gen) == (i, b)
47+
), f'\nIndex: {i}\nLength: {dataset_length}\nThreads: {thread_count}\nExpected: {(i, b)}\nActual: {next(gen)}'
4048
expected_chunk_high -= 1
4149
i = b
42-
43-
assert (
44-
algorithm.chunk_split(dataset_length, thread_count) == heap
45-
), f'\nLength: {dataset_length}\nThreads: {thread_count}\nExpected: {heap}\nActual: {algorithm.chunk_split(dataset_length, thread_count)}'

0 commit comments

Comments
 (0)