22from __future__ import annotations
33
44import asyncio
5+ import logging
56import signal
67import sys
78import tempfile
1011from contextlib import AsyncExitStack , suppress
1112from datetime import timedelta
1213from functools import partial
13- from logging import getLogger
1414from pathlib import Path
1515from typing import TYPE_CHECKING , Any , AsyncContextManager , Callable , Generic , Literal , Union , cast
1616
3737from crawlee .enqueue_strategy import EnqueueStrategy
3838from crawlee .events import LocalEventManager
3939from crawlee .http_clients import HttpxClient
40+ from crawlee .log_config import CrawleeLogFormatter
4041from crawlee .models import BaseRequestData , DatasetItemsListPage , Request , RequestState
4142from crawlee .sessions import SessionPool
4243from crawlee .statistics import Statistics
5758ErrorHandler = Callable [[TCrawlingContext , Exception ], Awaitable [Union [Request , None ]]]
5859FailedRequestHandler = Callable [[TCrawlingContext , Exception ], Awaitable [None ]]
5960
60- logger = getLogger (__name__ )
61-
6261
6362class BasicCrawlerOptions (TypedDict , Generic [TCrawlingContext ]):
6463 """Copy of the parameter types of `BasicCrawler.__init__` meant for typing forwarded __init__ args in subclasses."""
@@ -77,8 +76,10 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
7776 retry_on_blocked : NotRequired [bool ]
7877 proxy_configuration : NotRequired [ProxyConfiguration ]
7978 statistics : NotRequired [Statistics [StatisticsState ]]
79+ configure_logging : NotRequired [bool ]
8080 _context_pipeline : NotRequired [ContextPipeline [TCrawlingContext ]]
8181 _additional_context_managers : NotRequired [Sequence [AsyncContextManager ]]
82+ _logger : NotRequired [logging .Logger ]
8283
8384
8485class BasicCrawler (Generic [TCrawlingContext ]):
@@ -109,8 +110,10 @@ def __init__(
109110 retry_on_blocked : bool = True ,
110111 proxy_configuration : ProxyConfiguration | None = None ,
111112 statistics : Statistics | None = None ,
113+ configure_logging : bool = True ,
112114 _context_pipeline : ContextPipeline [TCrawlingContext ] | None = None ,
113115 _additional_context_managers : Sequence [AsyncContextManager ] | None = None ,
116+ _logger : logging .Logger | None = None ,
114117 ) -> None :
115118 """Initialize the BasicCrawler.
116119
@@ -134,10 +137,11 @@ def __init__(
134137 retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
135138 proxy_configuration: An HTTP proxy configuration to be used for making requests
136139 statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
137- browser_pool: A preconfigured `BrowserPool` instance for browser crawling.
140+ configure_logging: If set to True, the crawler will configure the logging infrastructure
138141 _context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
139142 This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
140143 _additional_context_managers: Additional context managers to be used in the crawler lifecycle.
144+ _logger: A logger instance passed from a child class to ensure consistent labels
141145 """
142146 self ._router : Router [TCrawlingContext ] | None = None
143147
@@ -186,10 +190,27 @@ def __init__(
186190
187191 self ._retry_on_blocked = retry_on_blocked
188192
193+ if configure_logging :
194+ handler = logging .StreamHandler ()
195+ handler .setFormatter (CrawleeLogFormatter ())
196+
197+ root_logger = logging .getLogger ()
198+
199+ for old_handler in root_logger .handlers [:]:
200+ root_logger .removeHandler (old_handler )
201+
202+ root_logger .addHandler (handler )
203+ root_logger .setLevel (logging .INFO if not sys .flags .dev_mode else logging .DEBUG )
204+
205+ if not _logger :
206+ _logger = logging .getLogger (__name__ )
207+
208+ self ._logger = _logger
209+
189210 self ._proxy_configuration = proxy_configuration
190211 self ._statistics = statistics or Statistics (
191212 event_manager = self ._event_manager ,
192- log_message = f'{ logger .name } request statistics' ,
213+ log_message = f'{ self . _logger .name } request statistics' ,
193214 )
194215 self ._additional_context_managers = _additional_context_managers or []
195216
@@ -235,7 +256,7 @@ async def _get_session(self) -> Session | None:
235256 timeout_message = 'Fetching a session from the pool timed out after '
236257 f'{ self ._internal_timeout .total_seconds ()} seconds' ,
237258 max_retries = 3 ,
238- logger = logger ,
259+ logger = self . _logger ,
239260 )
240261
241262 async def _get_proxy_info (self , request : Request , session : Session | None ) -> ProxyInfo | None :
@@ -316,7 +337,7 @@ def sigint_handler() -> None:
316337
317338 if not interrupted :
318339 interrupted = True
319- logger .info ('Pausing... Press CTRL+C again to force exit.' )
340+ self . _logger .info ('Pausing... Press CTRL+C again to force exit.' )
320341
321342 run_task .cancel ()
322343
@@ -334,19 +355,24 @@ def sigint_handler() -> None:
334355 asyncio .get_running_loop ().remove_signal_handler (signal .SIGINT )
335356
336357 if self ._statistics .error_tracker .total > 0 :
337- logger .info (
358+ self . _logger .info (
338359 'Error analysis:'
339360 f' total_errors={ self ._statistics .error_tracker .total } '
340361 f' unique_errors={ self ._statistics .error_tracker .unique_error_count } '
341362 )
342363
343364 if interrupted :
344- logger .info (f'The crawl was interrupted. To resume, do: CRAWLEE_PURGE_ON_START=0 python { sys .argv [0 ]} ' )
365+ self ._logger .info (
366+ f'The crawl was interrupted. To resume, do: CRAWLEE_PURGE_ON_START=0 python { sys .argv [0 ]} '
367+ )
345368
346369 self ._running = False
347370 self ._has_finished_before = True
348371
349- return self ._statistics .calculate ()
372+ final_statistics = self ._statistics .calculate ()
373+ self ._logger .info (f'Final request statistics: { final_statistics } ' )
374+
375+ return final_statistics
350376
351377 async def _run_crawler (self ) -> None :
352378 async with AsyncExitStack () as exit_stack :
@@ -564,14 +590,14 @@ async def _handle_request_error(self, crawling_context: TCrawlingContext, error:
564590 timeout = self ._internal_timeout ,
565591 timeout_message = 'Marking request as handled timed out after '
566592 f'{ self ._internal_timeout .total_seconds ()} seconds' ,
567- logger = logger ,
593+ logger = self . _logger ,
568594 max_retries = 3 ,
569595 )
570596 await self ._handle_failed_request (crawling_context , error )
571597 self ._statistics .record_request_processing_failure (request .id or request .unique_key )
572598
573599 async def _handle_failed_request (self , crawling_context : TCrawlingContext , error : Exception ) -> None :
574- logger .exception ('Request failed and reached maximum retries' , exc_info = error )
600+ self . _logger .exception ('Request failed and reached maximum retries' , exc_info = error )
575601 self ._statistics .error_tracker .add (error )
576602
577603 if self ._failed_request_handler :
@@ -633,19 +659,19 @@ async def __is_finished_function(self) -> bool:
633659 is_finished = await request_provider .is_finished ()
634660
635661 if self ._max_requests_count_exceeded :
636- logger .info (
662+ self . _logger .info (
637663 f'The crawler has reached its limit of { self ._max_requests_per_crawl } requests per crawl. '
638664 f'All ongoing requests have now completed. Total requests processed: '
639665 f'{ self ._statistics .state .requests_finished } . The crawler will now shut down.'
640666 )
641- logger .info (f'is_finished: { is_finished } ' )
667+ self . _logger .info (f'is_finished: { is_finished } ' )
642668 return True
643669
644670 return is_finished
645671
646672 async def __is_task_ready_function (self ) -> bool :
647673 if self ._max_requests_count_exceeded :
648- logger .info (
674+ self . _logger .info (
649675 f'The crawler has reached its limit of { self ._max_requests_per_crawl } requests per crawl. '
650676 f'The crawler will soon shut down. Ongoing requests will be allowed to complete.'
651677 )
@@ -661,7 +687,7 @@ async def __run_task_function(self) -> None:
661687 lambda : request_provider .fetch_next_request (),
662688 timeout = self ._internal_timeout ,
663689 timeout_message = f'Fetching next request failed after { self ._internal_timeout .total_seconds ()} seconds' ,
664- logger = logger ,
690+ logger = self . _logger ,
665691 max_retries = 3 ,
666692 )
667693
@@ -679,6 +705,7 @@ async def __run_task_function(self) -> None:
679705 send_request = self ._prepare_send_request_function (session , proxy_info ),
680706 add_requests = result .add_requests ,
681707 push_data = self ._push_data ,
708+ log = self ._logger ,
682709 )
683710
684711 statistics_id = request .id or request .unique_key
@@ -692,7 +719,7 @@ async def __run_task_function(self) -> None:
692719 timeout = self ._request_handler_timeout ,
693720 timeout_message = 'Request handler timed out after '
694721 f'{ self ._request_handler_timeout .total_seconds ()} seconds' ,
695- logger = logger ,
722+ logger = self . _logger ,
696723 )
697724
698725 await self ._commit_request_handler_result (crawling_context , result )
@@ -702,7 +729,7 @@ async def __run_task_function(self) -> None:
702729 timeout = self ._internal_timeout ,
703730 timeout_message = 'Marking request as handled timed out after '
704731 f'{ self ._internal_timeout .total_seconds ()} seconds' ,
705- logger = logger ,
732+ logger = self . _logger ,
706733 max_retries = 3 ,
707734 )
708735
@@ -727,15 +754,15 @@ async def __run_task_function(self) -> None:
727754 timeout = self ._internal_timeout ,
728755 timeout_message = 'Handling request failure timed out after '
729756 f'{ self ._internal_timeout .total_seconds ()} seconds' ,
730- logger = logger ,
757+ logger = self . _logger ,
731758 )
732759
733760 request .state = RequestState .DONE
734761 except UserDefinedErrorHandlerError :
735762 request .state = RequestState .ERROR
736763 raise
737764 except Exception as secondary_error :
738- logger .exception (
765+ self . _logger .exception (
739766 'An exception occurred during handling of failed request. This places the crawler '
740767 'and its underlying storages into an unknown state and crawling will be terminated.' ,
741768 exc_info = secondary_error ,
@@ -750,7 +777,7 @@ async def __run_task_function(self) -> None:
750777 raise RuntimeError ('SessionError raised in a crawling context without a session' ) from session_error
751778
752779 if self ._should_retry_request (crawling_context , session_error ):
753- logger .warning ('Encountered a session error, rotating session and retrying' )
780+ self . _logger .warning ('Encountered a session error, rotating session and retrying' )
754781
755782 crawling_context .session .retire ()
756783
@@ -761,33 +788,33 @@ async def __run_task_function(self) -> None:
761788 await request_provider .reclaim_request (request )
762789 self ._statistics .error_tracker_retry .add (session_error )
763790 else :
764- logger .exception ('Request failed and reached maximum retries' , exc_info = session_error )
791+ self . _logger .exception ('Request failed and reached maximum retries' , exc_info = session_error )
765792
766793 await wait_for (
767794 lambda : request_provider .mark_request_as_handled (crawling_context .request ),
768795 timeout = self ._internal_timeout ,
769796 timeout_message = 'Marking request as handled timed out after '
770797 f'{ self ._internal_timeout .total_seconds ()} seconds' ,
771- logger = logger ,
798+ logger = self . _logger ,
772799 max_retries = 3 ,
773800 )
774801
775802 self ._statistics .record_request_processing_failure (statistics_id )
776803 self ._statistics .error_tracker .add (session_error )
777804 except ContextPipelineInterruptedError as interruped_error :
778- logger .debug ('The context pipeline was interrupted' , exc_info = interruped_error )
805+ self . _logger .debug ('The context pipeline was interrupted' , exc_info = interruped_error )
779806
780807 await wait_for (
781808 lambda : request_provider .mark_request_as_handled (crawling_context .request ),
782809 timeout = self ._internal_timeout ,
783810 timeout_message = 'Marking request as handled timed out after '
784811 f'{ self ._internal_timeout .total_seconds ()} seconds' ,
785- logger = logger ,
812+ logger = self . _logger ,
786813 max_retries = 3 ,
787814 )
788815 except ContextPipelineInitializationError as initialization_error :
789816 if self ._should_retry_request (crawling_context , initialization_error ):
790- logger .debug (
817+ self . _logger .debug (
791818 'An exception occurred during the initialization of crawling context, a retry is in order' ,
792819 exc_info = initialization_error ,
793820 )
@@ -797,21 +824,21 @@ async def __run_task_function(self) -> None:
797824 request .state = RequestState .DONE
798825 await request_provider .reclaim_request (request )
799826 else :
800- logger .exception ('Request failed and reached maximum retries' , exc_info = initialization_error )
827+ self . _logger .exception ('Request failed and reached maximum retries' , exc_info = initialization_error )
801828
802829 await wait_for (
803830 lambda : request_provider .mark_request_as_handled (crawling_context .request ),
804831 timeout = self ._internal_timeout ,
805832 timeout_message = 'Marking request as handled timed out after '
806833 f'{ self ._internal_timeout .total_seconds ()} seconds' ,
807- logger = logger ,
834+ logger = self . _logger ,
808835 max_retries = 3 ,
809836 )
810837
811838 if crawling_context .session :
812839 crawling_context .session .mark_bad ()
813840 except Exception as internal_error :
814- logger .exception (
841+ self . _logger .exception (
815842 'An exception occurred during handling of a request. This places the crawler '
816843 'and its underlying storages into an unknown state and crawling will be terminated.' ,
817844 exc_info = internal_error ,
0 commit comments