Skip to content

Commit

Permalink
pageserver: implement utilization score (neondatabase#8703)
Browse files Browse the repository at this point in the history
## Problem

When the utilization API was added, it was just a stub with disk space
information.

Disk space information isn't a very good metric for assigning tenants to
pageservers, because pageservers making full use of their disks would
always just have 85% utilization, irrespective of how much pressure they
had for disk space.

## Summary of changes

- Use the new layer visibiilty metric to calculate a "wanted size" per
tenant, and sum these to get a total local disk space wanted per
pageserver. This acts as the primary signal for utilization.
- Also use the shard count to calculate a utilization score, and take
the max of this and the disk-driven utilization. The shard count limit
is currently set as a constant 20,000, which matches contemporary
operational practices when loading pageservers.

The shard count limit means that for tiny/empty tenants, on a machine
with 3.84TB disk, each tiny tenant influences the utilization score as
if it had size 160MB.
  • Loading branch information
jcsp authored Aug 13, 2024
1 parent afb68b0 commit ecb0183
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 17 deletions.
90 changes: 82 additions & 8 deletions libs/pageserver_api/src/models/utilization.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use utils::serde_system_time::SystemTime;
use std::time::SystemTime;
use utils::{serde_percent::Percent, serde_system_time};

/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
Expand All @@ -9,19 +10,88 @@ use utils::serde_system_time::SystemTime;
/// not handle full u64 values properly.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
pub struct PageserverUtilization {
/// Used disk space
/// Used disk space (physical, ground truth from statfs())
#[serde(serialize_with = "ser_saturating_u63")]
pub disk_usage_bytes: u64,
/// Free disk space
#[serde(serialize_with = "ser_saturating_u63")]
pub free_space_bytes: u64,
/// Lower is better score for how good candidate for a next tenant would this pageserver be.
#[serde(serialize_with = "ser_saturating_u63")]

/// Wanted disk space, based on the tenant shards currently present on this pageserver: this
/// is like disk_usage_bytes, but it is stable and does not change with the cache state of
/// tenants, whereas disk_usage_bytes may reach the disk eviction `max_usage_pct` and stay
/// there, or may be unrealistically low if the pageserver has attached tenants which haven't
/// downloaded layers yet.
#[serde(serialize_with = "ser_saturating_u63", default)]
pub disk_wanted_bytes: u64,

// What proportion of total disk space will this pageserver use before it starts evicting data?
#[serde(default = "unity_percent")]
pub disk_usable_pct: Percent,

// How many shards are currently on this node?
#[serde(default)]
pub shard_count: u32,

// How many shards should this node be able to handle at most?
#[serde(default)]
pub max_shard_count: u32,

/// Cached result of [`Self::score`]
pub utilization_score: u64,

/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
pub captured_at: SystemTime,
pub captured_at: serde_system_time::SystemTime,
}

fn unity_percent() -> Percent {
Percent::new(0).unwrap()
}

impl PageserverUtilization {
const UTILIZATION_FULL: u64 = 1000000;

/// Calculate a utilization score. The result is to be inrepreted as a fraction of
/// Self::UTILIZATION_FULL.
///
/// Lower values are more affine to scheduling more work on this node.
/// - UTILIZATION_FULL represents an ideal node which is fully utilized but should not receive any more work.
/// - 0.0 represents an empty node.
/// - Negative values are forbidden
/// - Values over UTILIZATION_FULL indicate an overloaded node, which may show degraded performance due to
/// layer eviction.
pub fn score(&self) -> u64 {
let disk_usable_capacity = ((self.disk_usage_bytes + self.free_space_bytes)
* self.disk_usable_pct.get() as u64)
/ 100;
let disk_utilization_score =
self.disk_wanted_bytes * Self::UTILIZATION_FULL / disk_usable_capacity;

let shard_utilization_score =
self.shard_count as u64 * Self::UTILIZATION_FULL / self.max_shard_count as u64;
std::cmp::max(disk_utilization_score, shard_utilization_score)
}

pub fn refresh_score(&mut self) {
self.utilization_score = self.score();
}

/// A utilization structure that has a full utilization score: use this as a placeholder when
/// you need a utilization but don't have real values yet.
pub fn full() -> Self {
Self {
disk_usage_bytes: 1,
free_space_bytes: 0,
disk_wanted_bytes: 1,
disk_usable_pct: Percent::new(100).unwrap(),
shard_count: 1,
max_shard_count: 1,
utilization_score: Self::UTILIZATION_FULL,
captured_at: serde_system_time::SystemTime(SystemTime::now()),
}
}
}

/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
Expand Down Expand Up @@ -49,15 +119,19 @@ mod tests {
let doc = PageserverUtilization {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime(
disk_wanted_bytes: u64::MAX,
utilization_score: 13,
disk_usable_pct: Percent::new(90).unwrap(),
shard_count: 100,
max_shard_count: 200,
captured_at: serde_system_time::SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
};

let s = serde_json::to_string(&doc).unwrap();

let expected = r#"{"disk_usage_bytes":9223372036854775807,"free_space_bytes":0,"utilization_score":9223372036854775807,"captured_at":"2024-02-21T10:02:59.000Z"}"#;
let expected = "{\"disk_usage_bytes\":9223372036854775807,\"free_space_bytes\":0,\"disk_wanted_bytes\":9223372036854775807,\"disk_usable_pct\":90,\"shard_count\":100,\"max_shard_count\":200,\"utilization_score\":13,\"captured_at\":\"2024-02-21T10:02:59.000Z\"}";

assert_eq!(s, expected);
}
Expand Down
5 changes: 3 additions & 2 deletions pageserver/src/http/routes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2357,8 +2357,9 @@ async fn get_utilization(
// regenerate at most 1Hz to allow polling at any rate.
if !still_valid {
let path = state.conf.tenants_path();
let doc = crate::utilization::regenerate(path.as_std_path())
.map_err(ApiError::InternalServerError)?;
let doc =
crate::utilization::regenerate(state.conf, path.as_std_path(), &state.tenant_manager)
.map_err(ApiError::InternalServerError)?;

let mut buf = Vec::new();
serde_json::to_writer(&mut buf, &doc)
Expand Down
13 changes: 13 additions & 0 deletions pageserver/src/tenant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3732,6 +3732,19 @@ impl Tenant {
pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt {
self.tenant_conf.load().tenant_conf.clone()
}

/// How much local storage would this tenant like to have? It can cope with
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
pub(crate) fn local_storage_wanted(&self) -> u64 {
let mut wanted = 0;
let timelines = self.timelines.lock().unwrap();
for timeline in timelines.values() {
wanted += timeline.metrics.visible_physical_size_gauge.get();
}
wanted
}
}

/// Create the cluster temporarily in 'initdbpath' directory inside the repository
Expand Down
51 changes: 51 additions & 0 deletions pageserver/src/tenant/mgr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2085,6 +2085,57 @@ impl TenantManager {
}
}
}

/// Calculate the tenant shards' contributions to this pageserver's utilization metrics. The
/// returned values are:
/// - the number of bytes of local disk space this pageserver's shards are requesting, i.e.
/// how much space they would use if not impacted by disk usage eviction.
/// - the number of tenant shards currently on this pageserver, including attached
/// and secondary.
///
/// This function is quite expensive: callers are expected to cache the result and
/// limit how often they call it.
pub(crate) fn calculate_utilization(&self) -> Result<(u64, u32), TenantMapListError> {
let tenants = self.tenants.read().unwrap();
let m = match &*tenants {
TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m,
};
let shard_count = m.len();
let mut wanted_bytes = 0;

for tenant_slot in m.values() {
match tenant_slot {
TenantSlot::InProgress(_barrier) => {
// While a slot is being changed, we can't know how much storage it wants. This
// means this function's output can fluctuate if a lot of changes are going on
// (such as transitions from secondary to attached).
//
// We could wait for the barrier and retry, but it's important that the utilization
// API is responsive, and the data quality impact is not very significant.
continue;
}
TenantSlot::Attached(tenant) => {
wanted_bytes += tenant.local_storage_wanted();
}
TenantSlot::Secondary(secondary) => {
let progress = secondary.progress.lock().unwrap();
wanted_bytes += if progress.heatmap_mtime.is_some() {
// If we have heatmap info, then we will 'want' the sum
// of the size of layers in the heatmap: this is how much space
// we would use if not doing any eviction.
progress.bytes_total
} else {
// In the absence of heatmap info, assume that the secondary location simply
// needs as much space as it is currently using.
secondary.resident_size_metric.get()
}
}
}
}

Ok((wanted_bytes, shard_count as u32))
}
}

#[derive(Debug, thiserror::Error)]
Expand Down
34 changes: 27 additions & 7 deletions pageserver/src/utilization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,17 @@

use anyhow::Context;
use std::path::Path;
use utils::serde_percent::Percent;

use pageserver_api::models::PageserverUtilization;

pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtilization> {
// TODO: currently the http api ratelimits this to 1Hz at most, which is probably good enough
use crate::{config::PageServerConf, tenant::mgr::TenantManager};

pub(crate) fn regenerate(
conf: &PageServerConf,
tenants_path: &Path,
tenant_manager: &TenantManager,
) -> anyhow::Result<PageserverUtilization> {
let statvfs = nix::sys::statvfs::statvfs(tenants_path)
.map_err(std::io::Error::from)
.context("statvfs tenants directory")?;
Expand All @@ -34,16 +39,31 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result<PageserverUtiliz

let captured_at = std::time::SystemTime::now();

let doc = PageserverUtilization {
// Calculate aggregate utilization from tenants on this pageserver
let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;

// Fetch the fraction of disk space which may be used
let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
Some(e) => e.max_usage_pct,
None => Percent::new(100).unwrap(),
};

// Express a static value for how many shards we may schedule on one node
const MAX_SHARDS: u32 = 20000;

let mut doc = PageserverUtilization {
disk_usage_bytes: used,
free_space_bytes: free,
// lower is better; start with a constant
//
// note that u64::MAX will be output as i64::MAX as u64, but that should not matter
utilization_score: u64::MAX,
disk_wanted_bytes,
disk_usable_pct,
shard_count,
max_shard_count: MAX_SHARDS,
utilization_score: 0,
captured_at: utils::serde_system_time::SystemTime(captured_at),
};

doc.refresh_score();

// TODO: make utilization_score into a metric

Ok(doc)
Expand Down

0 comments on commit ecb0183

Please sign in to comment.