Skip to content

Commit a67b72f

Browse files
authored
feat: automatic logging setup (#229)
- closes #214
1 parent 237ec78 commit a67b72f

8 files changed

Lines changed: 83 additions & 29 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
- BREAKING: `BasicCrawler.export_data` helper method which replaces `BasicCrawler.export_to`
88
- `Configuration.get_global_configuration` method
9+
- Automatic logging setup
10+
- Context helper for logging (`context.log`)
911

1012
### Fixes
1113

src/crawlee/basic_crawler/basic_crawler.py

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from __future__ import annotations
33

44
import asyncio
5+
import logging
56
import signal
67
import sys
78
import tempfile
@@ -10,7 +11,6 @@
1011
from contextlib import AsyncExitStack, suppress
1112
from datetime import timedelta
1213
from functools import partial
13-
from logging import getLogger
1414
from pathlib import Path
1515
from typing import TYPE_CHECKING, Any, AsyncContextManager, Callable, Generic, Literal, Union, cast
1616

@@ -37,6 +37,7 @@
3737
from crawlee.enqueue_strategy import EnqueueStrategy
3838
from crawlee.events import LocalEventManager
3939
from crawlee.http_clients import HttpxClient
40+
from crawlee.log_config import CrawleeLogFormatter
4041
from crawlee.models import BaseRequestData, DatasetItemsListPage, Request, RequestState
4142
from crawlee.sessions import SessionPool
4243
from crawlee.statistics import Statistics
@@ -57,8 +58,6 @@
5758
ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]]
5859
FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
5960

60-
logger = getLogger(__name__)
61-
6261

6362
class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
6463
"""Copy of the parameter types of `BasicCrawler.__init__` meant for typing forwarded __init__ args in subclasses."""
@@ -77,8 +76,10 @@ class BasicCrawlerOptions(TypedDict, Generic[TCrawlingContext]):
7776
retry_on_blocked: NotRequired[bool]
7877
proxy_configuration: NotRequired[ProxyConfiguration]
7978
statistics: NotRequired[Statistics[StatisticsState]]
79+
configure_logging: NotRequired[bool]
8080
_context_pipeline: NotRequired[ContextPipeline[TCrawlingContext]]
8181
_additional_context_managers: NotRequired[Sequence[AsyncContextManager]]
82+
_logger: NotRequired[logging.Logger]
8283

8384

8485
class BasicCrawler(Generic[TCrawlingContext]):
@@ -109,8 +110,10 @@ def __init__(
109110
retry_on_blocked: bool = True,
110111
proxy_configuration: ProxyConfiguration | None = None,
111112
statistics: Statistics | None = None,
113+
configure_logging: bool = True,
112114
_context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
113115
_additional_context_managers: Sequence[AsyncContextManager] | None = None,
116+
_logger: logging.Logger | None = None,
114117
) -> None:
115118
"""Initialize the BasicCrawler.
116119
@@ -134,10 +137,11 @@ def __init__(
134137
retry_on_blocked: If set to True, the crawler will try to automatically bypass any detected bot protection
135138
proxy_configuration: An HTTP proxy configuration to be used for making requests
136139
statistics: A preconfigured `Statistics` instance if you wish to use non-default configuration
137-
browser_pool: A preconfigured `BrowserPool` instance for browser crawling.
140+
configure_logging: If set to True, the crawler will configure the logging infrastructure
138141
_context_pipeline: Allows extending the request lifecycle and modifying the crawling context.
139142
This parameter is meant to be used by child classes, not when BasicCrawler is instantiated directly.
140143
_additional_context_managers: Additional context managers to be used in the crawler lifecycle.
144+
_logger: A logger instance passed from a child class to ensure consistent labels
141145
"""
142146
self._router: Router[TCrawlingContext] | None = None
143147

@@ -186,10 +190,27 @@ def __init__(
186190

187191
self._retry_on_blocked = retry_on_blocked
188192

193+
if configure_logging:
194+
handler = logging.StreamHandler()
195+
handler.setFormatter(CrawleeLogFormatter())
196+
197+
root_logger = logging.getLogger()
198+
199+
for old_handler in root_logger.handlers[:]:
200+
root_logger.removeHandler(old_handler)
201+
202+
root_logger.addHandler(handler)
203+
root_logger.setLevel(logging.INFO if not sys.flags.dev_mode else logging.DEBUG)
204+
205+
if not _logger:
206+
_logger = logging.getLogger(__name__)
207+
208+
self._logger = _logger
209+
189210
self._proxy_configuration = proxy_configuration
190211
self._statistics = statistics or Statistics(
191212
event_manager=self._event_manager,
192-
log_message=f'{logger.name} request statistics',
213+
log_message=f'{self._logger.name} request statistics',
193214
)
194215
self._additional_context_managers = _additional_context_managers or []
195216

@@ -235,7 +256,7 @@ async def _get_session(self) -> Session | None:
235256
timeout_message='Fetching a session from the pool timed out after '
236257
f'{self._internal_timeout.total_seconds()} seconds',
237258
max_retries=3,
238-
logger=logger,
259+
logger=self._logger,
239260
)
240261

241262
async def _get_proxy_info(self, request: Request, session: Session | None) -> ProxyInfo | None:
@@ -316,7 +337,7 @@ def sigint_handler() -> None:
316337

317338
if not interrupted:
318339
interrupted = True
319-
logger.info('Pausing... Press CTRL+C again to force exit.')
340+
self._logger.info('Pausing... Press CTRL+C again to force exit.')
320341

321342
run_task.cancel()
322343

@@ -334,19 +355,24 @@ def sigint_handler() -> None:
334355
asyncio.get_running_loop().remove_signal_handler(signal.SIGINT)
335356

336357
if self._statistics.error_tracker.total > 0:
337-
logger.info(
358+
self._logger.info(
338359
'Error analysis:'
339360
f' total_errors={self._statistics.error_tracker.total}'
340361
f' unique_errors={self._statistics.error_tracker.unique_error_count}'
341362
)
342363

343364
if interrupted:
344-
logger.info(f'The crawl was interrupted. To resume, do: CRAWLEE_PURGE_ON_START=0 python {sys.argv[0]}')
365+
self._logger.info(
366+
f'The crawl was interrupted. To resume, do: CRAWLEE_PURGE_ON_START=0 python {sys.argv[0]}'
367+
)
345368

346369
self._running = False
347370
self._has_finished_before = True
348371

349-
return self._statistics.calculate()
372+
final_statistics = self._statistics.calculate()
373+
self._logger.info(f'Final request statistics: {final_statistics}')
374+
375+
return final_statistics
350376

351377
async def _run_crawler(self) -> None:
352378
async with AsyncExitStack() as exit_stack:
@@ -564,14 +590,14 @@ async def _handle_request_error(self, crawling_context: TCrawlingContext, error:
564590
timeout=self._internal_timeout,
565591
timeout_message='Marking request as handled timed out after '
566592
f'{self._internal_timeout.total_seconds()} seconds',
567-
logger=logger,
593+
logger=self._logger,
568594
max_retries=3,
569595
)
570596
await self._handle_failed_request(crawling_context, error)
571597
self._statistics.record_request_processing_failure(request.id or request.unique_key)
572598

573599
async def _handle_failed_request(self, crawling_context: TCrawlingContext, error: Exception) -> None:
574-
logger.exception('Request failed and reached maximum retries', exc_info=error)
600+
self._logger.exception('Request failed and reached maximum retries', exc_info=error)
575601
self._statistics.error_tracker.add(error)
576602

577603
if self._failed_request_handler:
@@ -633,19 +659,19 @@ async def __is_finished_function(self) -> bool:
633659
is_finished = await request_provider.is_finished()
634660

635661
if self._max_requests_count_exceeded:
636-
logger.info(
662+
self._logger.info(
637663
f'The crawler has reached its limit of {self._max_requests_per_crawl} requests per crawl. '
638664
f'All ongoing requests have now completed. Total requests processed: '
639665
f'{self._statistics.state.requests_finished}. The crawler will now shut down.'
640666
)
641-
logger.info(f'is_finished: {is_finished}')
667+
self._logger.info(f'is_finished: {is_finished}')
642668
return True
643669

644670
return is_finished
645671

646672
async def __is_task_ready_function(self) -> bool:
647673
if self._max_requests_count_exceeded:
648-
logger.info(
674+
self._logger.info(
649675
f'The crawler has reached its limit of {self._max_requests_per_crawl} requests per crawl. '
650676
f'The crawler will soon shut down. Ongoing requests will be allowed to complete.'
651677
)
@@ -661,7 +687,7 @@ async def __run_task_function(self) -> None:
661687
lambda: request_provider.fetch_next_request(),
662688
timeout=self._internal_timeout,
663689
timeout_message=f'Fetching next request failed after {self._internal_timeout.total_seconds()} seconds',
664-
logger=logger,
690+
logger=self._logger,
665691
max_retries=3,
666692
)
667693

@@ -679,6 +705,7 @@ async def __run_task_function(self) -> None:
679705
send_request=self._prepare_send_request_function(session, proxy_info),
680706
add_requests=result.add_requests,
681707
push_data=self._push_data,
708+
log=self._logger,
682709
)
683710

684711
statistics_id = request.id or request.unique_key
@@ -692,7 +719,7 @@ async def __run_task_function(self) -> None:
692719
timeout=self._request_handler_timeout,
693720
timeout_message='Request handler timed out after '
694721
f'{self._request_handler_timeout.total_seconds()} seconds',
695-
logger=logger,
722+
logger=self._logger,
696723
)
697724

698725
await self._commit_request_handler_result(crawling_context, result)
@@ -702,7 +729,7 @@ async def __run_task_function(self) -> None:
702729
timeout=self._internal_timeout,
703730
timeout_message='Marking request as handled timed out after '
704731
f'{self._internal_timeout.total_seconds()} seconds',
705-
logger=logger,
732+
logger=self._logger,
706733
max_retries=3,
707734
)
708735

@@ -727,15 +754,15 @@ async def __run_task_function(self) -> None:
727754
timeout=self._internal_timeout,
728755
timeout_message='Handling request failure timed out after '
729756
f'{self._internal_timeout.total_seconds()} seconds',
730-
logger=logger,
757+
logger=self._logger,
731758
)
732759

733760
request.state = RequestState.DONE
734761
except UserDefinedErrorHandlerError:
735762
request.state = RequestState.ERROR
736763
raise
737764
except Exception as secondary_error:
738-
logger.exception(
765+
self._logger.exception(
739766
'An exception occurred during handling of failed request. This places the crawler '
740767
'and its underlying storages into an unknown state and crawling will be terminated.',
741768
exc_info=secondary_error,
@@ -750,7 +777,7 @@ async def __run_task_function(self) -> None:
750777
raise RuntimeError('SessionError raised in a crawling context without a session') from session_error
751778

752779
if self._should_retry_request(crawling_context, session_error):
753-
logger.warning('Encountered a session error, rotating session and retrying')
780+
self._logger.warning('Encountered a session error, rotating session and retrying')
754781

755782
crawling_context.session.retire()
756783

@@ -761,33 +788,33 @@ async def __run_task_function(self) -> None:
761788
await request_provider.reclaim_request(request)
762789
self._statistics.error_tracker_retry.add(session_error)
763790
else:
764-
logger.exception('Request failed and reached maximum retries', exc_info=session_error)
791+
self._logger.exception('Request failed and reached maximum retries', exc_info=session_error)
765792

766793
await wait_for(
767794
lambda: request_provider.mark_request_as_handled(crawling_context.request),
768795
timeout=self._internal_timeout,
769796
timeout_message='Marking request as handled timed out after '
770797
f'{self._internal_timeout.total_seconds()} seconds',
771-
logger=logger,
798+
logger=self._logger,
772799
max_retries=3,
773800
)
774801

775802
self._statistics.record_request_processing_failure(statistics_id)
776803
self._statistics.error_tracker.add(session_error)
777804
except ContextPipelineInterruptedError as interruped_error:
778-
logger.debug('The context pipeline was interrupted', exc_info=interruped_error)
805+
self._logger.debug('The context pipeline was interrupted', exc_info=interruped_error)
779806

780807
await wait_for(
781808
lambda: request_provider.mark_request_as_handled(crawling_context.request),
782809
timeout=self._internal_timeout,
783810
timeout_message='Marking request as handled timed out after '
784811
f'{self._internal_timeout.total_seconds()} seconds',
785-
logger=logger,
812+
logger=self._logger,
786813
max_retries=3,
787814
)
788815
except ContextPipelineInitializationError as initialization_error:
789816
if self._should_retry_request(crawling_context, initialization_error):
790-
logger.debug(
817+
self._logger.debug(
791818
'An exception occurred during the initialization of crawling context, a retry is in order',
792819
exc_info=initialization_error,
793820
)
@@ -797,21 +824,21 @@ async def __run_task_function(self) -> None:
797824
request.state = RequestState.DONE
798825
await request_provider.reclaim_request(request)
799826
else:
800-
logger.exception('Request failed and reached maximum retries', exc_info=initialization_error)
827+
self._logger.exception('Request failed and reached maximum retries', exc_info=initialization_error)
801828

802829
await wait_for(
803830
lambda: request_provider.mark_request_as_handled(crawling_context.request),
804831
timeout=self._internal_timeout,
805832
timeout_message='Marking request as handled timed out after '
806833
f'{self._internal_timeout.total_seconds()} seconds',
807-
logger=logger,
834+
logger=self._logger,
808835
max_retries=3,
809836
)
810837

811838
if crawling_context.session:
812839
crawling_context.session.mark_bad()
813840
except Exception as internal_error:
814-
logger.exception(
841+
self._logger.exception(
815842
'An exception occurred during handling of a request. This places the crawler '
816843
'and its underlying storages into an unknown state and crawling will be terminated.',
817844
exc_info=internal_error,

src/crawlee/basic_crawler/types.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# ruff: noqa: TCH003
22
from __future__ import annotations
33

4+
import logging
45
import re
56
from collections.abc import Coroutine, Sequence
67
from dataclasses import dataclass, field
@@ -124,6 +125,7 @@ class BasicCrawlingContext:
124125
send_request: SendRequestFunction
125126
add_requests: AddRequestsFunction
126127
push_data: PushDataFunction
128+
log: logging.Logger
127129

128130

129131
class AddRequestsFunctionCall(AddRequestsKwargs):

src/crawlee/beautifulsoup_crawler/beautifulsoup_crawler.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import asyncio
4+
import logging
45
from typing import TYPE_CHECKING, Any, AsyncGenerator, Iterable, Literal
56

67
from bs4 import BeautifulSoup, Tag
@@ -59,6 +60,8 @@ def __init__(
5960
),
6061
)
6162

63+
kwargs.setdefault('_logger', logging.getLogger(__name__))
64+
6265
super().__init__(**kwargs)
6366

6467
async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
@@ -76,6 +79,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
7679
add_requests=context.add_requests,
7780
send_request=context.send_request,
7881
push_data=context.push_data,
82+
log=context.log,
7983
http_response=result.http_response,
8084
)
8185

@@ -138,6 +142,7 @@ async def enqueue_links(
138142
add_requests=context.add_requests,
139143
send_request=context.send_request,
140144
push_data=context.push_data,
145+
log=context.log,
141146
http_response=context.http_response,
142147
soup=soup,
143148
)

src/crawlee/http_crawler/http_crawler.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import logging
34
from typing import TYPE_CHECKING, AsyncGenerator, Iterable
45

56
from typing_extensions import Unpack
@@ -45,6 +46,8 @@ def __init__(
4546
),
4647
)
4748

49+
kwargs.setdefault('_logger', logging.getLogger(__name__))
50+
4851
super().__init__(**kwargs)
4952

5053
async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenerator[HttpCrawlingContext, None]:
@@ -62,6 +65,7 @@ async def _make_http_request(self, context: BasicCrawlingContext) -> AsyncGenera
6265
add_requests=context.add_requests,
6366
send_request=context.send_request,
6467
push_data=context.push_data,
68+
log=context.log,
6569
http_response=result.http_response,
6670
)
6771

0 commit comments

Comments
 (0)