Skip to content

Commit 4ad39fa

Browse files
committed
Skip first logs based on datetime of the marker
1 parent 85ead2f commit 4ad39fa

File tree

2 files changed

+23
-14
lines changed

2 files changed

+23
-14
lines changed

src/apify_client/clients/resource_clients/log.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import threading
77
from asyncio import Task
88
from contextlib import asynccontextmanager, contextmanager
9+
from datetime import datetime, timezone
910
from threading import Thread
1011
from typing import TYPE_CHECKING, Any, cast
1112

@@ -209,6 +210,9 @@ class StreamedLog:
209210
It uses buffer to deal with possibly chunked logs. Chunked logs are stored in buffer. Chunks are expected to contain
210211
specific markers that indicate the start of the log message. Each time a new chunk with complete split marker
211212
arrives, the buffer is processed, logged and emptied.
213+
214+
This works only if the logs have datetime marker in ISO format. For example, `2025-05-12T15:35:59.429Z`. This is the
215+
default log standard for the actors.
212216
"""
213217

214218
# Test related flag to enable propagation of logs to the `caplog` fixture during tests.
@@ -230,8 +234,8 @@ def __init__(self, to_logger: logging.Logger, *, from_start: bool = True) -> Non
230234
if self._force_propagate:
231235
to_logger.propagate = True
232236
self._stream_buffer = list[str]()
233-
self._split_marker = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)') # Ex:2025-05-12T15:35:59.429Z
234-
self._from_start = from_start
237+
self._split_marker = re.compile(r'(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z)')
238+
self._relevancy_time_limit: datetime | None = None if from_start else datetime.now(tz=timezone.utc)
235239

236240
def _process_new_data(self, data: bytes) -> None:
237241
new_chunk = data.decode('utf-8')
@@ -258,6 +262,11 @@ def _log_buffer_content(self, *, include_last_part: bool = False) -> None:
258262
self._stream_buffer = all_parts[-2:]
259263

260264
for marker, content in zip(message_markers, message_contents):
265+
if self._relevancy_time_limit:
266+
log_time = datetime.fromisoformat(marker.replace('Z', '+00:00'))
267+
if log_time < self._relevancy_time_limit:
268+
# Skip irrelevant logs
269+
continue
261270
message = marker + content
262271
self._to_logger.log(level=self._guess_log_level_from_message(message), msg=message.strip())
263272

@@ -314,12 +323,7 @@ def _stream_log(self) -> None:
314323
with self._log_client.stream(raw=True) as log_stream:
315324
if not log_stream:
316325
return
317-
# The first chunk contains all older logs from the start of the actor run until now.
318-
skip_first_chunk = not self._from_start
319326
for data in log_stream.iter_bytes():
320-
if skip_first_chunk:
321-
skip_first_chunk = False
322-
continue
323327
self._process_new_data(data)
324328
if self._stop_logging:
325329
break
@@ -363,12 +367,7 @@ async def _stream_log(self) -> None:
363367
async with self._log_client.stream(raw=True) as log_stream:
364368
if not log_stream:
365369
return
366-
# The first chunk contains all older logs from the start of the actor run until now.
367-
skip_first_chunk = not self._from_start
368370
async for data in log_stream.aiter_bytes():
369-
if skip_first_chunk:
370-
skip_first_chunk = False
371-
continue
372371
self._process_new_data(data)
373372

374373
# If the stream is finished, then the last part will be also processed.

tests/unit/test_logging.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import logging
44
import time
55
from collections.abc import AsyncIterator, Iterator
6+
from datetime import datetime
7+
from unittest.mock import patch
68

79
import httpx
810
import pytest
@@ -126,7 +128,11 @@ async def test_redirected_logs_async(
126128
"""Test that redirected logs are formatted correctly."""
127129

128130
run_client = ApifyClientAsync(token='mocked_token', api_url=_MOCKED_API_URL).run(run_id=_MOCKED_RUN_ID)
129-
streamed_log = await run_client.get_streamed_log(actor_name=_MOCKED_ACTOR_NAME, from_start=log_from_start)
131+
132+
with patch('apify_client.clients.resource_clients.log.datetime') as mocked_datetime:
133+
# Mock `now()` so that it has a timestamp greater than the first 3 logs
134+
mocked_datetime.now.return_value = datetime.fromisoformat('2025-05-13T07:24:14.132+00:00')
135+
streamed_log = await run_client.get_streamed_log(actor_name=_MOCKED_ACTOR_NAME, from_start=log_from_start)
130136

131137
# Set `propagate=True` during the tests, so that caplog can see the logs.
132138
logger_name = f'apify.{_MOCKED_ACTOR_NAME}-{_MOCKED_RUN_ID}'
@@ -155,7 +161,11 @@ def test_redirected_logs_sync(
155161
"""Test that redirected logs are formatted correctly."""
156162

157163
run_client = ApifyClient(token='mocked_token', api_url=_MOCKED_API_URL).run(run_id=_MOCKED_RUN_ID)
158-
streamed_log = run_client.get_streamed_log(actor_name=_MOCKED_ACTOR_NAME, from_start=log_from_start)
164+
165+
with patch('apify_client.clients.resource_clients.log.datetime') as mocked_datetime:
166+
# Mock `now()` so that it has a timestamp greater than the first 3 logs
167+
mocked_datetime.now.return_value = datetime.fromisoformat('2025-05-13T07:24:14.132+00:00')
168+
streamed_log = run_client.get_streamed_log(actor_name=_MOCKED_ACTOR_NAME, from_start=log_from_start)
159169

160170
# Set `propagate=True` during the tests, so that caplog can see the logs.
161171
logger_name = f'apify.{_MOCKED_ACTOR_NAME}-{_MOCKED_RUN_ID}'

0 commit comments

Comments
 (0)