RiveryIO · alonalmog82 · Nov 4, 2024 · Nov 4, 2024
@@ -11,14 +11,15 @@
 from .logging import setup_logging, log_with_context
 
 class HighThroughputDirCleaner:
-    def __init__(self, root_path: str, max_age_hours: float, max_workers: int = 8, dry_run: bool = True):
+    def __init__(self, root_path: str, max_age_hours: float, max_workers: int = 8, dry_run: bool = True, chunk_size: int = 1000):
         """Initialize the directory cleaner."""
         self.root_path = Path(root_path)
         self.max_workers = max_workers
         self.dry_run = dry_run
         self.max_age_hours = max_age_hours  # Store the max age
         self.cutoff_time = time.time() - (max_age_hours * 3600)
         self.dirs_queue: queue.Queue = queue.Queue()
+        self.chunk_size = chunk_size
 
         # Statistics
         self.stats = {
@@ -41,13 +42,14 @@ def update_stats(self, **kwargs: int) -> None:
                     self.stats[key] += value
 
     def process_directory(self, directory: Path) -> None:
-        """Process all files in a directory."""
+        """Process all files in a directory and queue them for chunked processing."""
         try:
+            file_paths = []
             with os.scandir(directory) as entries:
                 for entry in entries:
                     try:
                         if entry.is_file(follow_symlinks=False):
-                            self.process_file(Path(entry.path))
+                            file_paths.append(Path(entry.path))
                         elif entry.is_dir(follow_symlinks=False):
                             self.dirs_queue.put(Path(entry.path))
                     except OSError as e:
@@ -57,6 +59,11 @@ def process_directory(self, directory: Path) -> None:
                         })
                         self.update_stats(errors=1)
 
+            # Process files in chunks
+            for i in range(0, len(file_paths), self.chunk_size):
+                chunk = file_paths[i:i+self.chunk_size]
+                self.process_file_chunk(chunk)
+
             self.update_stats(dirs_processed=1)
 
         except OSError as e:
@@ -66,50 +73,43 @@ def process_directory(self, directory: Path) -> None:
             })
             self.update_stats(errors=1)
 
-    def process_file(self, file_path: Path) -> None:
-        """Process a single file and purge if too old."""
-        try:
-            # print(f"Processing: {file_path.name}")  # debug line
-            stat = file_path.lstat() if file_path.is_symlink() else file_path.stat()
-            self.update_stats(files_processed=1)
-
-            # Calculate age for debugging
-            age_hours = (time.time() - stat.st_mtime) / 3600
-            self.logger.info(f"Processing file: {file_path}, "
-                            f"Age: {age_hours:.1f} hours, "
-                            f"Cutoff: {self.max_age_hours} hours, "
-                            f"Mtime: {stat.st_mtime}, "
-                            f"Cutoff time: {self.cutoff_time}")
-
-            if stat.st_mtime < self.cutoff_time:  # Changed comparison
-                self.update_stats(files_to_purge=1)
-
-                if not self.dry_run:
-                    try:
-                        file_path.unlink()
-                        self.update_stats(files_purged=1)
-                        self.logger.info(f"File purged", extra={
+    def process_file_chunk(self, file_paths: list[Path]) -> None:
+        """Process a chunk of files and purge if too old using only file metadata."""
+        for file_path in file_paths:
+            try:
+                stat = file_path.stat()
+                self.update_stats(files_processed=1)
+
+                age_hours = (time.time() - stat.st_mtime) / 3600
+                if stat.st_mtime < self.cutoff_time:
+                    self.update_stats(files_to_purge=1)
+
+                    if not self.dry_run:
+                        try:
+                            file_path.unlink()
+                            self.update_stats(files_purged=1)
+                            self.logger.info(f"File purged", extra={
+                                'file_path': str(file_path),
+                                'age_hours': age_hours
+                            })
+                        except OSError as e:
+                            self.logger.error("Error purging file", extra={
+                                'file_path': str(file_path),
+                                'error': str(e)
+                            })
+                            self.update_stats(errors=1)
+                    else:
+                        self.logger.info("File marked for purging (dry run)", extra={
                             'file_path': str(file_path),
                             'age_hours': age_hours
                         })
-                    except OSError as e:
-                        self.logger.error("Error purging file", extra={
-                            'file_path': str(file_path),
-                            'error': str(e)
-                        })
-                        self.update_stats(errors=1)
-                else:
-                    self.logger.info("File marked for purging (dry run)", extra={
-                        'file_path': str(file_path),
-                        'age_hours': age_hours
-                    })
 
-        except OSError as e:
-            self.logger.error("Error processing file", extra={
-                'file_path': str(file_path),
-                'error': str(e)
-            })
-            self.update_stats(errors=1)
+            except OSError as e:
+                self.logger.error("Error processing file", extra={
+                    'file_path': str(file_path),
+                    'error': str(e)
+                })
+                self.update_stats(errors=1)
 
     def clean(self) -> None:
         """Main cleaning function using ThreadPoolExecutor."""
@@ -118,7 +118,8 @@ def clean(self) -> None:
         log_with_context(self.logger, 'info', f"Starting directory cleanup - {mode} MODE", {
             'root_path': str(self.root_path),
             'max_workers': self.max_workers,
-            'dry_run': self.dry_run
+            'dry_run': self.dry_run,
+            'chunk_size': self.chunk_size
         })
 
         # Start with root directory
@@ -177,13 +178,16 @@ def main() -> None:
                       help='Number of worker threads')
     parser.add_argument('--dry-run', action='store_true',
                       help='Perform a dry run without purging files')
+    parser.add_argument('--chunk-size', type=int, default=1000,
+                      help='Number of files to process in each chunk')
     args = parser.parse_args()
 
     cleaner = HighThroughputDirCleaner(
         root_path=args.path,
         max_age_hours=args.max_age_hours,
         max_workers=args.workers,
-        dry_run=args.dry_run
+        dry_run=args.dry_run,
+        chunk_size=args.chunk_size
     )
     cleaner.clean()