controller: use PageserverUtilization for scheduling (neondatabase#8711)
## Problem

Previously, the controller used only shard counts for scheduling.
This works well when hosting only many-sharded tenants, but much less
well when hosting single-sharded tenants, whose size per shard varies
far more.

Closes: neondatabase#7798

## Summary of changes

- Instead of UtilizationScore, carry the full PageserverUtilization
through into the Scheduler.
- Use PageserverUtilization::score() instead of the shard count when
ordering nodes during scheduling (a sketch of the effect follows the
Q&A below).

Q: Why did test_sharding_split_smoke need updating in this PR?
A: There is an interesting side effect during shard splits: because we do
not decrement the shard count in the utilization when we de-schedule the
pre-split shards, the controller now prefers to pick _different_ nodes
for the child shards than the ones that held secondaries before the
split. We could use our knowledge of the split to fix up the
utilizations more actively in this situation, but I'm leaning toward
keeping the code simpler, since in practical systems the impact of one
shard on a node's utilization should be fairly low (single-digit %).
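
To make the ordering change concrete, here is a minimal, self-contained sketch (not the controller's actual Scheduler code; node names and all numbers are invented). A node holding a few very large single-sharded tenants loses to a node with many small shards once candidates are ranked by utilization score:

```rust
// Simplified stand-in for a scheduling candidate; the real code carries a full
// PageserverUtilization per node rather than a bare score.
struct Candidate {
    node: &'static str,
    shard_count: u32, // what the controller used to sort by
    score: u64,       // PageserverUtilization::score(): 0 = empty, 1_000_000 = full
}

fn main() {
    let mut candidates = vec![
        // Few shards, but they are huge single-sharded tenants: the disk is nearly full.
        Candidate { node: "ps-1", shard_count: 40, score: 950_000 },
        // Many small shards, plenty of free disk.
        Candidate { node: "ps-2", shard_count: 300, score: 400_000 },
    ];

    // Old behaviour: order by shard count -> ps-1 wins despite its nearly full disk.
    candidates.sort_by_key(|c| c.shard_count);
    assert_eq!(candidates[0].node, "ps-1");

    // New behaviour: order by utilization score -> ps-2 wins.
    candidates.sort_by_key(|c| c.score);
    assert_eq!(candidates[0].node, "ps-2");
}
```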
jcsp authored Aug 23, 2024
1 parent c1cb7a0 commit b65a95f
Showing 11 changed files with 340 additions and 101 deletions.
21 changes: 6 additions & 15 deletions libs/pageserver_api/src/controller_api.rs
@@ -8,6 +8,7 @@ use std::time::{Duration, Instant};
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};

use crate::models::PageserverUtilization;
use crate::{
models::{ShardParameters, TenantConfig},
shard::{ShardStripeSize, TenantShardId},
@@ -140,23 +141,11 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}

/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
pub struct UtilizationScore(pub u64);

impl UtilizationScore {
pub fn worst() -> Self {
UtilizationScore(u64::MAX)
}
}

#[derive(Serialize, Clone, Copy, Debug)]
#[derive(Serialize, Clone, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
pub enum NodeAvailability {
// Normal, happy state
Active(UtilizationScore),
Active(PageserverUtilization),
// Node is warming up, but we expect it to become available soon. Covers
// the time span between the re-attach response being composed on the storage controller
// and the first successful heartbeat after the processing of the re-attach response
@@ -195,7 +184,9 @@ impl From<NodeAvailabilityWrapper> for NodeAvailability {
match val {
// Assume the worst utilisation score to begin with. It will later be updated by
// the heartbeats.
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
NodeAvailabilityWrapper::Active => {
NodeAvailability::Active(PageserverUtilization::full())
}
NodeAvailabilityWrapper::WarmingUp => NodeAvailability::WarmingUp(Instant::now()),
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
}
67 changes: 61 additions & 6 deletions libs/pageserver_api/src/models/utilization.rs
@@ -38,7 +38,7 @@ pub struct PageserverUtilization {
pub max_shard_count: u32,

/// Cached result of [`Self::score`]
pub utilization_score: u64,
pub utilization_score: Option<u64>,

/// When was this snapshot captured, pageserver local time.
///
@@ -50,6 +50,8 @@ fn unity_percent() -> Percent {
Percent::new(0).unwrap()
}

pub type RawScore = u64;

impl PageserverUtilization {
const UTILIZATION_FULL: u64 = 1000000;

@@ -62,7 +64,7 @@ impl PageserverUtilization {
/// - Negative values are forbidden
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
/// layer eviction.
pub fn score(&self) -> u64 {
pub fn score(&self) -> RawScore {
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
* self.disk_usable_pct.get() as u64)
/ 100;
@@ -74,8 +76,30 @@
std::cmp::max(disk_utilization_score, shard_utilization_score)
}

pub fn refresh_score(&mut self) {
self.utilization_score = self.score();
pub fn cached_score(&mut self) -> RawScore {
match self.utilization_score {
None => {
let s = self.score();
self.utilization_score = Some(s);
s
}
Some(s) => s,
}
}

/// Whether a node is currently hosting more work than it can comfortably handle. This does not indicate that
/// it will fail, but it is a strong signal that more work should not be added unless there is no alternative.
pub fn is_overloaded(score: RawScore) -> bool {
score >= Self::UTILIZATION_FULL
}

pub fn adjust_shard_count_max(&mut self, shard_count: u32) {
if self.shard_count < shard_count {
self.shard_count = shard_count;

// Dirty cache: this will be calculated next time someone retrieves the score
self.utilization_score = None;
}
}

/// A utilization structure that has a full utilization score: use this as a placeholder when
@@ -88,7 +112,38 @@ impl PageserverUtilization {
disk_usable_pct: Percent::new(100).unwrap(),
shard_count: 1,
max_shard_count: 1,
utilization_score: Self::UTILIZATION_FULL,
utilization_score: Some(Self::UTILIZATION_FULL),
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
}

/// Test helper
pub mod test_utilization {
use super::PageserverUtilization;
use std::time::SystemTime;
use utils::{
serde_percent::Percent,
serde_system_time::{self},
};

// Parameters of the imaginary node used for test utilization instances
const TEST_DISK_SIZE: u64 = 1024 * 1024 * 1024 * 1024;
const TEST_SHARDS_MAX: u32 = 1000;

/// Unit test helper. Unconditionally compiled because cfg(test) doesn't carry across crates. Do
/// not abuse this function from non-test code.
///
/// Emulates a node with a 1000 shard limit and a 1TB disk.
pub fn simple(shard_count: u32, disk_wanted_bytes: u64) -> PageserverUtilization {
PageserverUtilization {
disk_usage_bytes: disk_wanted_bytes,
free_space_bytes: TEST_DISK_SIZE - std::cmp::min(disk_wanted_bytes, TEST_DISK_SIZE),
disk_wanted_bytes,
disk_usable_pct: Percent::new(100).unwrap(),
shard_count,
max_shard_count: TEST_SHARDS_MAX,
utilization_score: None,
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
@@ -120,7 +175,7 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
disk_wanted_bytes: u64::MAX,
utilization_score: 13,
utilization_score: Some(13),
disk_usable_pct: Percent::new(90).unwrap(),
shard_count: 100,
max_shard_count: 200,
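A hedged sketch of how the new API surface above fits together (assuming `utilization` is a public module under `pageserver_api::models`, as laid out in this diff; the shard and disk numbers are invented):

```rust
use pageserver_api::models::utilization::test_utilization;
use pageserver_api::models::PageserverUtilization;

fn main() {
    // Imaginary node from the test helper: 250 shards and 600 GiB of wanted disk,
    // against the helper's limits of 1000 shards and a 1 TiB disk.
    let mut util = test_utilization::simple(250, 600 * 1024 * 1024 * 1024);

    // The first call computes and caches the score; later calls reuse the cache.
    let score = util.cached_score();
    assert!(!PageserverUtilization::is_overloaded(score));

    // Scheduling another shard onto the node bumps the locally tracked shard count,
    // which dirties the cache so the next read recomputes the score.
    util.adjust_shard_count_max(251);
    let _rescored = util.cached_score();
}
```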
8 changes: 8 additions & 0 deletions pageserver/src/metrics.rs
@@ -1803,6 +1803,14 @@ pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::n
.expect("failed to define a metric")
});

pub(crate) static NODE_UTILIZATION_SCORE: Lazy<UIntGauge> = Lazy::new(|| {
register_uint_gauge!(
"pageserver_utilization_score",
"The utilization score we report to the storage controller for scheduling, where 0 is empty, 1000000 is full, and anything above is considered overloaded",
)
.expect("failed to define a metric")
});

pub(crate) static SECONDARY_HEATMAP_TOTAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_secondary_heatmap_total_size",
18 changes: 13 additions & 5 deletions pageserver/src/tenant.rs
@@ -3741,13 +3741,21 @@ impl Tenant {
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
///
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
/// than they report here, due to layer eviction. Tenants with many active branches may
/// actually use more than they report here.
pub(crate) fn local_storage_wanted(&self) -> u64 {
let mut wanted = 0;
let timelines = self.timelines.lock().unwrap();
for timeline in timelines.values() {
wanted += timeline.metrics.visible_physical_size_gauge.get();
}
wanted

// Heuristic: we use the max() of the timelines' visible sizes, rather than the sum. This
// reflects the observation that on tenants with multiple large branches, typically only one
// of them is used actively enough to occupy space on disk.
timelines
.values()
.map(|t| t.metrics.visible_physical_size_gauge.get())
.max()
.unwrap_or(0)
}
}

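The effect of the `local_storage_wanted` change is easiest to see on toy numbers (a hedged illustration, not the Tenant code; the timeline sizes are invented):

```rust
fn main() {
    // Visible physical sizes of three timelines in one tenant, in bytes.
    let timeline_sizes: [u64; 3] = [80 << 30, 75 << 30, 5 << 30];

    // Old heuristic: sum of all timelines -> 160 GiB wanted.
    let sum: u64 = timeline_sizes.iter().sum();

    // New heuristic: max over timelines -> 80 GiB wanted, reflecting that usually
    // only one large branch is active enough to stay resident on disk.
    let max: u64 = timeline_sizes.iter().copied().max().unwrap_or(0);

    assert_eq!(sum >> 30, 160);
    assert_eq!(max >> 30, 80);
}
```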
10 changes: 5 additions & 5 deletions pageserver/src/utilization.rs
@@ -9,7 +9,7 @@ use utils::serde_percent::Percent;

use pageserver_api::models::PageserverUtilization;

use crate::{config::PageServerConf, tenant::mgr::TenantManager};
use crate::{config::PageServerConf, metrics::NODE_UTILIZATION_SCORE, tenant::mgr::TenantManager};

pub(crate) fn regenerate(
conf: &PageServerConf,
@@ -58,13 +58,13 @@
disk_usable_pct,
shard_count,
max_shard_count: MAX_SHARDS,
utilization_score: 0,
utilization_score: None,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};

doc.refresh_score();

// TODO: make utilization_score into a metric
// Initialize `PageserverUtilization::utilization_score`
let score = doc.cached_score();
NODE_UTILIZATION_SCORE.set(score);

Ok(doc)
}
10 changes: 4 additions & 6 deletions storage_controller/src/heartbeater.rs
@@ -6,10 +6,7 @@ use std::{
};
use tokio_util::sync::CancellationToken;

use pageserver_api::{
controller_api::{NodeAvailability, UtilizationScore},
models::PageserverUtilization,
};
use pageserver_api::{controller_api::NodeAvailability, models::PageserverUtilization};

use thiserror::Error;
use utils::id::NodeId;
@@ -147,7 +144,8 @@ impl HeartbeaterTask {
// goes through to the pageserver even when the node is marked offline.
// This doesn't impact the availability observed by [`crate::service::Service`].
let mut node_clone = node.clone();
node_clone.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
node_clone
.set_availability(NodeAvailability::Active(PageserverUtilization::full()));

async move {
let response = node_clone
@@ -179,7 +177,7 @@
node.get_availability()
{
PageserverState::WarmingUp {
started_at: last_seen_at,
started_at: *last_seen_at,
}
} else {
PageserverState::Offline
24 changes: 12 additions & 12 deletions storage_controller/src/node.rs
@@ -92,15 +92,15 @@ impl Node {
}
}

pub(crate) fn get_availability(&self) -> NodeAvailability {
self.availability
pub(crate) fn get_availability(&self) -> &NodeAvailability {
&self.availability
}

pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
use AvailabilityTransition::*;
use NodeAvailability::WarmingUp;

match self.get_availability_transition(availability) {
match self.get_availability_transition(&availability) {
ToActive => {
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
// users of previously-cloned copies of the node will still see the old cancellation
@@ -115,8 +115,8 @@
Unchanged | ToWarmingUpFromOffline => {}
}

if let (WarmingUp(crnt), WarmingUp(proposed)) = (self.availability, availability) {
self.availability = WarmingUp(std::cmp::max(crnt, proposed));
if let (WarmingUp(crnt), WarmingUp(proposed)) = (&self.availability, &availability) {
self.availability = WarmingUp(std::cmp::max(*crnt, *proposed));
} else {
self.availability = availability;
}
@@ -126,12 +126,12 @@
/// into a description of the transition.
pub(crate) fn get_availability_transition(
&self,
availability: NodeAvailability,
availability: &NodeAvailability,
) -> AvailabilityTransition {
use AvailabilityTransition::*;
use NodeAvailability::*;

match (self.availability, availability) {
match (&self.availability, availability) {
(Offline, Active(_)) => ToActive,
(Active(_), Offline) => ToOffline,
(Active(_), WarmingUp(_)) => ToWarmingUpFromActive,
@@ -153,15 +153,15 @@

/// Is this node eligible to have work scheduled onto it?
pub(crate) fn may_schedule(&self) -> MaySchedule {
let score = match self.availability {
NodeAvailability::Active(score) => score,
let utilization = match &self.availability {
NodeAvailability::Active(u) => u.clone(),
NodeAvailability::Offline | NodeAvailability::WarmingUp(_) => return MaySchedule::No,
};

match self.scheduling {
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
NodeSchedulingPolicy::Active => MaySchedule::Yes(utilization),
NodeSchedulingPolicy::Draining => MaySchedule::No,
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
NodeSchedulingPolicy::Filling => MaySchedule::Yes(utilization),
NodeSchedulingPolicy::Pause => MaySchedule::No,
NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
}
@@ -285,7 +285,7 @@ impl Node {
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
availability: self.availability.clone().into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
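On the consuming side, `MaySchedule::Yes` now carries the whole utilization, so a single pass can both filter out nodes whose scheduling policy forbids new work and rank the rest by score. A hedged sketch with simplified stand-in types (not the controller's Scheduler; ids and scores are invented):

```rust
// Stand-ins for the private controller types.
struct Utilization {
    score: u64, // lower is a better scheduling candidate
}

enum MaySchedule {
    Yes(Utilization),
    No,
}

// Keep only nodes that may receive work, then pick the one with the lowest score.
fn pick_node(candidates: Vec<(u64, MaySchedule)>) -> Option<u64> {
    candidates
        .into_iter()
        .filter_map(|(id, may)| match may {
            MaySchedule::Yes(util) => Some((id, util.score)),
            MaySchedule::No => None, // e.g. Offline, Draining, Pause, PauseForRestart
        })
        .min_by_key(|(_, score)| *score)
        .map(|(id, _)| id)
}

fn main() {
    let picked = pick_node(vec![
        (1, MaySchedule::Yes(Utilization { score: 700_000 })),
        (2, MaySchedule::No),
        (3, MaySchedule::Yes(Utilization { score: 250_000 })),
    ]);
    assert_eq!(picked, Some(3));
}
```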