Skip to content

Commit c62e554

Browse files
janbucharB4nan
andauthored
feat: Memory usage limit configuration via environment variables (#502)
Co-authored-by: Martin Adámek <banan23@gmail.com>
1 parent d68c674 commit c62e554

5 files changed

Lines changed: 37 additions & 22 deletions

File tree

src/crawlee/_autoscaling/snapshotter.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,11 @@ def __init__(
3838
client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
3939
max_used_cpu_ratio: float = 0.95,
4040
max_memory_size: ByteSize | None = None,
41-
max_used_memory_ratio: float = 0.7,
41+
max_used_memory_ratio: float = 0.9,
4242
max_event_loop_delay: timedelta = timedelta(milliseconds=50),
4343
max_client_errors: int = 1,
4444
snapshot_history: timedelta = timedelta(seconds=30),
45+
available_memory_ratio: float | None = None,
4546
reserve_memory_ratio: float = 0.5,
4647
memory_warning_cooldown_period: timedelta = timedelta(milliseconds=10000),
4748
client_rate_limit_error_retry_count: int = 2,
@@ -74,6 +75,8 @@ def __init__(
7475
7576
snapshot_history: Sets the time interval for which the snapshots are kept.
7677
78+
available_memory_ratio: How big part of the system memory should be used if `max_memory_size` is not given.
79+
7780
reserve_memory_ratio: Fraction of memory kept in reserve. Used to calculate critical memory overload
7881
threshold.
7982
@@ -83,6 +86,9 @@ def __init__(
8386
client_rate_limit_error_retry_count: Number of retries for a client request before considering it a failure
8487
due to rate limiting.
8588
"""
89+
if available_memory_ratio is None and max_memory_size is None:
90+
raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')
91+
8692
self._event_manager = event_manager
8793
self._event_loop_snapshot_interval = event_loop_snapshot_interval
8894
self._client_snapshot_interval = client_snapshot_interval
@@ -94,7 +100,9 @@ def __init__(
94100
self._reserve_memory_ratio = reserve_memory_ratio
95101
self._memory_warning_cooldown_period = memory_warning_cooldown_period
96102
self._client_rate_limit_error_retry_count = client_rate_limit_error_retry_count
97-
self._max_memory_size = max_memory_size or self._get_default_max_memory_size()
103+
self._max_memory_size = max_memory_size or self._get_default_max_memory_size(
104+
cast(float, available_memory_ratio)
105+
)
98106

99107
self._cpu_snapshots: list[CpuSnapshot] = []
100108
self._event_loop_snapshots: list[EventLoopSnapshot] = []
@@ -107,9 +115,9 @@ def __init__(
107115
self._timestamp_of_last_memory_warning: datetime = datetime.now(timezone.utc) - timedelta(hours=1)
108116

109117
@staticmethod
110-
def _get_default_max_memory_size() -> ByteSize:
118+
def _get_default_max_memory_size(available_memory_ratio: float) -> ByteSize:
111119
"""Default `memory_max_size` is 1/4 of the total system memory."""
112-
max_memory_size_in_bytes = int(psutil.virtual_memory().total * 0.25)
120+
max_memory_size_in_bytes = int(psutil.virtual_memory().total * available_memory_ratio)
113121
max_memory_size = ByteSize(max_memory_size_in_bytes)
114122
logger.info(f'Setting max_memory_size of this run to {max_memory_size}.')
115123
return max_memory_size

src/crawlee/basic_crawler/_basic_crawler.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from crawlee._log_config import CrawleeLogFormatter
2626
from crawlee._request import BaseRequestData, Request, RequestState
2727
from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction
28+
from crawlee._utils.byte_size import ByteSize
2829
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
2930
from crawlee._utils.wait import wait_for
3031
from crawlee.basic_crawler._context_pipeline import ContextPipeline
@@ -179,7 +180,13 @@ def __init__(
179180
self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name)
180181

181182
self._event_manager = event_manager or service_container.get_event_manager()
182-
self._snapshotter = Snapshotter(self._event_manager)
183+
self._snapshotter = Snapshotter(
184+
self._event_manager,
185+
max_memory_size=ByteSize.from_mb(self._configuration.memory_mbytes)
186+
if self._configuration.memory_mbytes
187+
else None,
188+
available_memory_ratio=self._configuration.available_memory_ratio,
189+
)
183190
self._pool = AutoscaledPool(
184191
system_status=SystemStatus(self._snapshotter),
185192
is_finished_function=self.__is_finished_function,

src/crawlee/configuration.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,14 +157,14 @@ class Configuration(BaseSettings):
157157
] = None
158158

159159
available_memory_ratio: Annotated[
160-
float | None,
160+
float,
161161
Field(
162162
validation_alias=AliasChoices(
163163
'apify_available_memory_ratio',
164164
'crawlee_available_memory_ratio',
165165
)
166166
),
167-
] = None
167+
] = 0.25
168168

169169
storage_dir: Annotated[
170170
str,

tests/unit/_autoscaling/test_snapshotter.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
@pytest.fixture
1919
def snapshotter() -> Snapshotter:
2020
mocked_event_manager = AsyncMock(spec=EventManager)
21-
return Snapshotter(mocked_event_manager)
21+
return Snapshotter(mocked_event_manager, available_memory_ratio=0.25)
2222

2323

2424
@pytest.fixture
@@ -33,7 +33,7 @@ def event_system_data_info() -> EventSystemInfoData:
3333

3434

3535
async def test_start_stop_lifecycle() -> None:
36-
async with LocalEventManager() as event_manager, Snapshotter(event_manager):
36+
async with LocalEventManager() as event_manager, Snapshotter(event_manager, available_memory_ratio=0.25):
3737
pass
3838

3939

@@ -169,31 +169,27 @@ def test_snapshot_pruning_keeps_recent_records_unaffected(snapshotter: Snapshott
169169
assert snapshotter._cpu_snapshots[1].created_at == now
170170

171171

172-
def test_memory_load_evaluation_logs_warning_on_high_usage(
173-
monkeypatch: pytest.MonkeyPatch,
174-
snapshotter: Snapshotter,
175-
) -> None:
176-
mock_logger_warn = MagicMock()
177-
monkeypatch.setattr(getLogger('crawlee._autoscaling.snapshotter'), 'warning', mock_logger_warn)
178-
snapshotter._max_memory_size = ByteSize.from_gb(8)
172+
def test_memory_load_evaluation_logs_warning_on_high_usage(caplog: pytest.LogCaptureFixture) -> None:
173+
snapshotter = Snapshotter(AsyncMock(spec=EventManager), max_memory_size=ByteSize.from_gb(8))
179174

180-
high_memory_usage = ByteSize.from_gb(8) * 0.9 # 90% of 8 GB
175+
high_memory_usage = ByteSize.from_gb(8) * 0.95 # 95% of 8 GB
181176

182177
snapshotter._evaluate_memory_load(
183178
current_memory_usage_size=high_memory_usage,
184179
snapshot_timestamp=datetime.now(timezone.utc),
185180
)
186181

187-
assert mock_logger_warn.call_count == 1
188-
assert 'Memory is critically overloaded' in mock_logger_warn.call_args[0][0]
182+
assert len(caplog.records) == 1
183+
assert caplog.records[0].levelname.lower() == 'warning'
184+
assert 'Memory is critically overloaded' in caplog.records[0].msg
189185

190186
# It should not log again, since the last log was short time ago
191187
snapshotter._evaluate_memory_load(
192188
current_memory_usage_size=high_memory_usage,
193189
snapshot_timestamp=datetime.now(timezone.utc),
194190
)
195191

196-
assert mock_logger_warn.call_count == 1
192+
assert len(caplog.records) == 1
197193

198194

199195
def test_memory_load_evaluation_silent_on_acceptable_usage(

tests/unit/_autoscaling/test_system_status.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020

2121
@pytest.fixture
2222
async def snapshotter() -> AsyncGenerator[Snapshotter, None]:
23-
async with LocalEventManager() as event_manager, Snapshotter(event_manager) as snapshotter:
23+
async with LocalEventManager() as event_manager, Snapshotter(
24+
event_manager, available_memory_ratio=0.25
25+
) as snapshotter:
2426
yield snapshotter
2527

2628

@@ -30,7 +32,9 @@ def now() -> datetime:
3032

3133

3234
async def test_start_stop_lifecycle() -> None:
33-
async with LocalEventManager() as event_manager, Snapshotter(event_manager) as snapshotter:
35+
async with LocalEventManager() as event_manager, Snapshotter(
36+
event_manager, available_memory_ratio=0.25
37+
) as snapshotter:
3438
system_status = SystemStatus(snapshotter)
3539
system_status.get_current_system_info()
3640
system_status.get_historical_system_info()

0 commit comments

Comments
 (0)