add incremental processing example #1101


Merged
merged 2 commits into from
May 20, 2025
64 changes: 64 additions & 0 deletions examples/incremental_processing/delta.py
@@ -0,0 +1,64 @@
#!/usr/bin/env python
"""
File Generator Script using DataChain Delta

This script demonstrates:
1. Creating numbered text files in a 'test' directory
2. Using DataChain's delta flag for incremental dataset processing

Each execution:
- Creates a new numbered file in the 'test' directory
- Updates a DataChain dataset to track these files incrementally
"""

import re
import time

from utils import generate_next_file

import datachain as dc
from datachain import C, File


def extract_file_number(file: File) -> int:
    """Extract file number from the filename."""
    match = re.search(r"file-(\d+)\.txt", file.name)
    if match:
        return int(match.group(1))
    return -1


def process_files_with_delta():
    """
    Process files in the test directory using DataChain with delta mode.
    This demonstrates incremental processing - only new files are processed.
    """
    chain = (
        dc.read_storage("test/", update=True, delta=True, delta_on="file.path")
Contributor
Note that without an explicit delta_compare it will look at all fields in the schema except file.path (since that's in delta_on already) to decide whether a file has changed. This means that two rows need to be identical (all fields the same) in order not to count as "modified / changed". If it counts them as changed, there is no performance gain from the delta update. You usually want to set `delta_compare=["file.version", "file.etag"]`.

There is a case, though, with non-versioned sources where file.version and file.etag are set randomly on every re-index, which causes the same thing to happen regardless, as it will catch everything as modified. In these cases, and in every other case where the user doesn't want to (or can't) track changed rows, the workaround is to set delta_compare to be the same as delta_on, but we need a better way.
Options are:

  • Make the default delta_compare=None to disable tracking changed rows instead of looking at all fields. If we go down this path, then DataChain.compare() and DataChain.diff() need to be changed as well to be consistent.
  • Add an additional flag for this, e.g. delta_ignore_changed.

I'm leaning more toward the first option, although the user then needs to explicitly set all fields in some cases (we lose the "shortcut" of the default being all fields). I don't have a strong opinion though.
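The delta_on / delta_compare semantics described here can be sketched in plain Python. This is an illustrative model only, not DataChain's actual implementation; `classify_rows` and the example field names are made up for the sketch. Rows are matched by the delta_on key, and a matched row only counts as unchanged if every compared field is equal — which is why comparing all fields by default can wipe out the performance gain:

```python
def classify_rows(old_rows, new_rows, delta_on, delta_compare=None):
    """Split new_rows into unchanged / modified / added relative to old_rows.

    If delta_compare is None, every field except the delta_on key is
    compared -- two rows must be fully identical to count as unchanged.
    """
    old_by_key = {row[delta_on]: row for row in old_rows}
    unchanged, modified, added = [], [], []
    for row in new_rows:
        key = row[delta_on]
        if key not in old_by_key:
            added.append(row)
            continue
        old = old_by_key[key]
        # Default: compare everything except the key itself.
        fields = delta_compare or [f for f in row if f != delta_on]
        if all(old.get(f) == row.get(f) for f in fields):
            unchanged.append(row)
        else:
            modified.append(row)
    return unchanged, modified, added


old = [{"path": "a.txt", "etag": "1", "processed_at": "t0"}]
new = [
    {"path": "a.txt", "etag": "1", "processed_at": "t1"},
    {"path": "b.txt", "etag": "2", "processed_at": "t1"},
]

# Comparing all fields: a.txt looks modified because processed_at differs.
u, m, a = classify_rows(old, new, "path")
assert (len(u), len(m), len(a)) == (0, 1, 1)

# Comparing only etag: a.txt is correctly seen as unchanged.
u, m, a = classify_rows(old, new, "path", delta_compare=["etag"])
assert (len(u), len(m), len(a)) == (1, 0, 1)
```

Setting delta_compare to the delta_on fields themselves makes `fields` trivially equal for every matched row, which is the "disable change tracking" workaround mentioned above.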

Member Author
thanks, I think we are fine in this particular example (?)

There is the case though with non-versioned sources where file.version and file.etag are randomly set every time on re-index

where did you experience this?

Member Author

@ilongin please let me know ^^

Contributor
@ilongin May 20, 2025
I thought I saw it in one of our gs buckets but now I checked and it seems like it's ok. version and etag should be set to empty string if they don't exist.

Regarding your example, yeah, you don't need to set anything, as none of the columns will change since you only append new files. If you re-created files every time when calling generate_next_file, it would be a problem: for local files we use mtime as the etag, which would mean delta would find all files modified every time.
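The mtime-as-etag point can be seen with plain Python. This is a small sketch of why re-creating a local file makes it look modified even when the bytes are identical; `local_etag` is a hypothetical stand-in for the mtime-based etag described above, not a DataChain API:

```python
import os
import tempfile


def local_etag(path: str) -> str:
    """Hypothetical stand-in for an mtime-based etag on a local file."""
    return str(os.stat(path).st_mtime_ns)


with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "file-0.txt")
    with open(path, "w") as f:
        f.write("This is file number 0\n")
    etag_before = local_etag(path)

    # Re-create the file with identical content, then bump mtime by 1ms
    # to simulate the later timestamp a real rewrite would get.
    with open(path, "w") as f:
        f.write("This is file number 0\n")
    st = os.stat(path)
    os.utime(path, ns=(st.st_atime_ns, st.st_mtime_ns + 1_000_000))
    etag_after = local_etag(path)

# Same bytes, different "etag" -- delta would flag the file as modified.
assert etag_before != etag_after
```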

        .filter(C("file.path").glob("*.txt"))
        .map(file_number=extract_file_number)
        .map(content=lambda file: file.read_text())
        .map(processed_at=lambda: time.strftime("%Y-%m-%d %H:%M:%S"))
        .save(name="test_files")
    )

    # Show information about the dataset
    print(f"\nProcessed files. Total records: {chain.count()}")
    print("\nDataset versions:")
    test_dataset = dc.datasets().filter(C("name") == "test_files")

    for version in test_dataset.collect("version"):
        print(f"- Version: {version}")

    # Show the last 3 records to demonstrate the incremental processing
    print("\nLatest files processed:")
    chain.order_by("file_number", descending=True).limit(3).show()


if __name__ == "__main__":
    # Generate a new file
    new_file = generate_next_file()
    print(f"Created new file: {new_file}")

    # Process new files using delta update
    process_files_with_delta()
41 changes: 41 additions & 0 deletions examples/incremental_processing/utils.py
@@ -0,0 +1,41 @@
#!/usr/bin/env python
"""
File Generator Helper

This helper creates numbered text files in a 'test' directory each time it runs.
The files follow the naming pattern: file-0.txt, file-1.txt, file-2.txt, etc.

Each execution, the script:

1. Creates the 'test' directory if it doesn't exist
2. Finds the highest numbered file currently present
3. Creates a new file with the next number in sequence
4. Adds timestamped content to the file
"""

import re
import time
from pathlib import Path


def generate_next_file() -> Path:
    """
    Generate the next numbered text file in the 'test' directory.
    """
    test_dir = Path("test")
    test_dir.mkdir(exist_ok=True)

    max_num = -1
    for file in test_dir.glob("file-*.txt"):
        if file.is_file():
            match = re.search(r"file-(\d+)\.txt", file.name)
            if match:
                max_num = max(max_num, int(match.group(1)))

    next_num = max_num + 1
    new_file_path = test_dir / f"file-{next_num}.txt"
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
    content = f"This is file number {next_num}\nCreated at: {timestamp}\n"
    new_file_path.write_text(content)

    return new_file_path
11 changes: 11 additions & 0 deletions tests/examples/test_examples.py
@@ -12,6 +12,10 @@

multimodal_examples = sorted(glob.glob("examples/multimodal/**/*.py", recursive=True))

incremental_processing_examples = sorted(
    glob.glob("examples/incremental_processing/delta.py", recursive=True)
)

computer_vision_examples = sorted(
    [
        filename
@@ -86,6 +90,13 @@ def test_multimodal(example):
)


@pytest.mark.examples
@pytest.mark.incremental_processing
@pytest.mark.parametrize("example", incremental_processing_examples)
def test_incremental_processing_examples(example):
    smoke_test(example)


@pytest.mark.examples
@pytest.mark.computer_vision
@pytest.mark.parametrize("example", computer_vision_examples)