Skip to content
This repository was archived by the owner on Nov 1, 2023. It is now read-only.

Commit 87eb606

Browse files
authored
Delete nodes when they're done (#1763)
* Delete nodes when they're done
* Missed a file
* Load node disposal strategy from env var
* Lint
* Fix subtle bug
* Deleting doesn't work; will 'decomission' nodes once they complete work
* Missed a file
* Remove logging line
1 parent faaa5d2 commit 87eb606

File tree

3 files changed

+49
-28
lines changed

3 files changed

+49
-28
lines changed

src/api-service/__app__/onefuzzlib/azure/auto_scale.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ def create_auto_scale_profile(
156156
# When there's at least 1 message in the pool queue
157157
operator=ComparisonOperationType.GREATER_THAN_OR_EQUAL,
158158
threshold=1,
159+
divide_per_instance=False,
159160
),
160161
scale_action=ScaleAction(
161162
direction=ScaleDirection.INCREASE,
@@ -170,16 +171,17 @@ def create_auto_scale_profile(
170171
metric_trigger=MetricTrigger(
171172
metric_name="ApproximateMessageCount",
172173
metric_resource_uri=queue_uri,
173-
# Check every 20 minutes
174-
time_grain=timedelta(minutes=20),
174+
# Check every 10 minutes
175+
time_grain=timedelta(minutes=10),
175176
# The average number of messages in the pool queue
176177
time_aggregation=TimeAggregationType.AVERAGE,
177178
statistic=MetricStatisticType.SUM,
178-
# Over the past 20 minutes
179-
time_window=timedelta(minutes=20),
179+
# Over the past 10 minutes
180+
time_window=timedelta(minutes=10),
180181
# When there are no messages in the pool queue
181182
operator=ComparisonOperationType.EQUALS,
182183
threshold=0,
184+
divide_per_instance=False,
183185
),
184186
scale_action=ScaleAction(
185187
direction=ScaleDirection.DECREASE,
@@ -194,7 +196,7 @@ def create_auto_scale_profile(
194196

195197
def default_auto_scale_profile(queue_uri: str, scaleset_size: int) -> AutoscaleProfile:
196198
return create_auto_scale_profile(
197-
queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 15
199+
queue_uri, 1, scaleset_size, scaleset_size, 1, 10, 1, 5
198200
)
199201

200202

src/api-service/__app__/onefuzzlib/workers/scalesets.py

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import datetime
77
import logging
8+
import os
89
from typing import Any, Dict, List, Optional, Tuple, Union
910
from uuid import UUID
1011

@@ -437,8 +438,13 @@ def cleanup_nodes(self) -> bool:
437438

438439
# Perform operations until they fail due to scaleset getting locked
439440
try:
440-
self.reimage_nodes(to_reimage, NodeDisaposalStrategy.scale_in)
441-
self.delete_nodes(to_delete, NodeDisaposalStrategy.scale_in)
441+
strategy_str = os.getenv("ONEFUZZ_NODE_DISPOSAL_STRATEGY", "scale_in")
442+
if strategy_str == "decomission":
443+
strategy = NodeDisaposalStrategy.decomission
444+
else:
445+
strategy = NodeDisaposalStrategy.scale_in
446+
self.reimage_nodes(to_reimage, strategy)
447+
self.delete_nodes(to_delete, strategy)
442448
except UnableToUpdate:
443449
logging.info(
444450
SCALESET_LOG_PREFIX
@@ -598,17 +604,23 @@ def delete_nodes(
598604
else:
599605
machine_ids.add(node.machine_id)
600606

601-
logging.info(
602-
SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
603-
self.scaleset_id,
604-
machine_ids,
605-
)
606-
delete_vmss_nodes(self.scaleset_id, machine_ids)
607-
for node in nodes:
608-
if node.machine_id in machine_ids:
609-
node.delete()
610-
if disposal_strategy == NodeDisaposalStrategy.scale_in:
607+
if disposal_strategy == NodeDisaposalStrategy.decomission:
608+
logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
609+
for node in nodes:
610+
if node.machine_id in machine_ids:
611611
node.release_scale_in_protection()
612+
else:
613+
logging.info(
614+
SCALESET_LOG_PREFIX + "deleting nodes scaleset_id:%s machine_id:%s",
615+
self.scaleset_id,
616+
machine_ids,
617+
)
618+
delete_vmss_nodes(self.scaleset_id, machine_ids)
619+
for node in nodes:
620+
if node.machine_id in machine_ids:
621+
node.delete()
622+
if disposal_strategy == NodeDisaposalStrategy.scale_in:
623+
node.release_scale_in_protection()
612624

613625
def reimage_nodes(
614626
self, nodes: List[Node], disposal_strategy: NodeDisaposalStrategy
@@ -659,18 +671,24 @@ def reimage_nodes(
659671
)
660672
return
661673

662-
result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
663-
if isinstance(result, Error):
664-
raise Exception(
665-
"unable to reimage nodes: %s:%s - %s"
666-
% (self.scaleset_id, machine_ids, result)
667-
)
668-
669-
for node in nodes:
670-
if node.machine_id in machine_ids:
671-
node.delete()
672-
if disposal_strategy == NodeDisaposalStrategy.scale_in:
674+
if disposal_strategy == NodeDisaposalStrategy.decomission:
675+
logging.info(SCALESET_LOG_PREFIX + "decomissioning nodes")
676+
for node in nodes:
677+
if node.machine_id in machine_ids:
673678
node.release_scale_in_protection()
679+
else:
680+
result = reimage_vmss_nodes(self.scaleset_id, machine_ids)
681+
if isinstance(result, Error):
682+
raise Exception(
683+
"unable to reimage nodes: %s:%s - %s"
684+
% (self.scaleset_id, machine_ids, result)
685+
)
686+
687+
for node in nodes:
688+
if node.machine_id in machine_ids:
689+
node.delete()
690+
if disposal_strategy == NodeDisaposalStrategy.scale_in:
691+
node.release_scale_in_protection()
674692

675693
def set_shutdown(self, now: bool) -> None:
676694
if now:

src/pytypes/onefuzztypes/enums.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,3 +417,4 @@ class UserFieldType(Enum):
417417

418418
class NodeDisaposalStrategy(Enum):
419419
scale_in = "scale_in"
420+
decomission = "decomission"

0 commit comments

Comments
 (0)