Skip to content

Commit ccc28fe

Browse files
authored
[sled-agent] Refactor service management out of StorageManager (#2946)
## History The Sled Agent has historically had two different "managers" responsible for Zones: 1. `ServiceManager`, which presided over zones that do not operate on Datasets 2. `StorageManager`, which manages disks, but also manages zones which operate on those disks This separation is even reflected in the sled agent API exposed to Nexus - the Sled Agent exposes: - `PUT /services` - `PUT /filesystem` For "add a service (within a zone) to this sled" vs "add a dataset (and corresponding zone) to this sled within a particular zpool". This has been kinda handy for Nexus, since "provision CRDB on this dataset" and "start the CRDB service on that dataset" don't need to be separate operations. Within the Sled Agent, however, it has been a pain-in-the-butt from the perspective of diverging implementations. The `StorageManager` and `ServiceManager` have evolved their own mechanisms for storing configs, identifying filesystems on which to place zpools, etc, even though their responsibilities (managing running zones) overlap quite a lot. ## This PR This PR migrates the responsibility for "service management" entirely into the `ServiceManager`, leaving the `StorageManager` responsible for monitoring disks. In detail, this means: - The responsibility for launching Clickhouse, CRDB, and Crucible zones has moved from `storage_manager.rs` into `services.rs` - Unfortunately, this also means we're taking a somewhat hacky approach to formatting CRDB. This is fixed in #2954. - The `StorageManager` no longer requires an Etherstub device during construction - The `ServiceZoneRequest` can operate on an optional `dataset` argument - The "config management" for datastore-based zones is now much more aligned with non-dataset zones. Each sled stores `/var/oxide/services.toml` and `/var/oxide/storage-services.toml` for each group. - These still need to be fixed with #2888, but it should be simpler now. 
- `filesystem_ensure` - which previously asked the `StorageManager` to format a dataset and also launch a zone - now asks the `StorageManager` to format a dataset, and separately asks the `ServiceManager` to launch a zone. - In the future, this may become vectorized ("ensure the sled has *all* the datasets we want...") to have parity with the service management, but this would require a more invasive change in Nexus.
1 parent 4261090 commit ccc28fe

File tree

13 files changed

+777
-758
lines changed

13 files changed

+777
-758
lines changed

illumos-utils/src/running_zone.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -592,8 +592,8 @@ impl InstalledZone {
592592
///
593593
/// This results in a zone name which is distinct across different zpools,
594594
/// but stable and predictable across reboots.
595-
pub fn get_zone_name(zone_name: &str, unique_name: Option<&str>) -> String {
596-
let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_name);
595+
pub fn get_zone_name(zone_type: &str, unique_name: Option<&str>) -> String {
596+
let mut zone_name = format!("{}{}", ZONE_PREFIX, zone_type);
597597
if let Some(suffix) = unique_name {
598598
zone_name.push_str(&format!("_{}", suffix));
599599
}
@@ -618,7 +618,7 @@ impl InstalledZone {
618618
log: &Logger,
619619
underlay_vnic_allocator: &VnicAllocator<Etherstub>,
620620
zone_root_path: &Path,
621-
zone_name: &str,
621+
zone_type: &str,
622622
unique_name: Option<&str>,
623623
datasets: &[zone::Dataset],
624624
filesystems: &[zone::Fs],
@@ -631,14 +631,14 @@ impl InstalledZone {
631631
let control_vnic =
632632
underlay_vnic_allocator.new_control(None).map_err(|err| {
633633
InstallZoneError::CreateVnic {
634-
zone: zone_name.to_string(),
634+
zone: zone_type.to_string(),
635635
err,
636636
}
637637
})?;
638638

639-
let full_zone_name = Self::get_zone_name(zone_name, unique_name);
639+
let full_zone_name = Self::get_zone_name(zone_type, unique_name);
640640
let zone_image_path =
641-
PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_name));
641+
PathBuf::from(&format!("/opt/oxide/{}.tar.gz", zone_type));
642642

643643
let net_device_names: Vec<String> = opte_ports
644644
.iter()

illumos-utils/src/zpool.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,7 @@ impl Zpool {
249249
}
250250

251251
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, JsonSchema)]
252+
#[serde(rename_all = "snake_case")]
252253
pub enum ZpoolKind {
253254
// This zpool is used for external storage (u.2)
254255
External,

openapi/sled-agent.json

Lines changed: 98 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -640,13 +640,6 @@
640640
{
641641
"type": "object",
642642
"properties": {
643-
"all_addresses": {
644-
"description": "The addresses of all nodes within the cluster.",
645-
"type": "array",
646-
"items": {
647-
"type": "string"
648-
}
649-
},
650643
"type": {
651644
"type": "string",
652645
"enum": [
@@ -655,7 +648,6 @@
655648
}
656649
},
657650
"required": [
658-
"all_addresses",
659651
"type"
660652
]
661653
},
@@ -689,6 +681,21 @@
689681
}
690682
]
691683
},
684+
"DatasetName": {
685+
"type": "object",
686+
"properties": {
687+
"kind": {
688+
"$ref": "#/components/schemas/DatasetKind"
689+
},
690+
"pool_name": {
691+
"$ref": "#/components/schemas/ZpoolName"
692+
}
693+
},
694+
"required": [
695+
"kind",
696+
"pool_name"
697+
]
698+
},
692699
"DendriteAsic": {
693700
"type": "string",
694701
"enum": [
@@ -1738,7 +1745,7 @@
17381745
"pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$"
17391746
},
17401747
"ServiceEnsureBody": {
1741-
"description": "Used to request that the Sled initialize certain services on initialization.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.",
1748+
"description": "Used to request that the Sled initialize multiple services.\n\nThis may be used to record that certain sleds are responsible for launching services which may not be associated with a dataset, such as Nexus.",
17421749
"type": "object",
17431750
"properties": {
17441751
"services": {
@@ -2036,6 +2043,48 @@
20362043
"mode",
20372044
"type"
20382045
]
2046+
},
2047+
{
2048+
"type": "object",
2049+
"properties": {
2050+
"type": {
2051+
"type": "string",
2052+
"enum": [
2053+
"clickhouse"
2054+
]
2055+
}
2056+
},
2057+
"required": [
2058+
"type"
2059+
]
2060+
},
2061+
{
2062+
"type": "object",
2063+
"properties": {
2064+
"type": {
2065+
"type": "string",
2066+
"enum": [
2067+
"cockroach_db"
2068+
]
2069+
}
2070+
},
2071+
"required": [
2072+
"type"
2073+
]
2074+
},
2075+
{
2076+
"type": "object",
2077+
"properties": {
2078+
"type": {
2079+
"type": "string",
2080+
"enum": [
2081+
"crucible"
2082+
]
2083+
}
2084+
},
2085+
"required": [
2086+
"type"
2087+
]
20392088
}
20402089
]
20412090
},
@@ -2050,6 +2099,15 @@
20502099
"format": "ipv6"
20512100
}
20522101
},
2102+
"dataset": {
2103+
"nullable": true,
2104+
"default": null,
2105+
"allOf": [
2106+
{
2107+
"$ref": "#/components/schemas/DatasetName"
2108+
}
2109+
]
2110+
},
20532111
"gz_addresses": {
20542112
"default": [],
20552113
"type": "array",
@@ -2080,6 +2138,7 @@
20802138
]
20812139
},
20822140
"ServiceZoneService": {
2141+
"description": "Used to request that the Sled initialize a single service.",
20832142
"type": "object",
20842143
"properties": {
20852144
"details": {
@@ -2481,13 +2540,16 @@
24812540
"description": "The type of zone which may be requested from Sled Agent",
24822541
"type": "string",
24832542
"enum": [
2543+
"clickhouse",
2544+
"cockroach_db",
2545+
"crucible_pantry",
2546+
"crucible",
24842547
"external_dns",
24852548
"internal_dns",
24862549
"nexus",
2550+
"ntp",
24872551
"oximeter",
2488-
"switch",
2489-
"crucible_pantry",
2490-
"ntp"
2552+
"switch"
24912553
]
24922554
},
24932555
"Zpool": {
@@ -2505,6 +2567,30 @@
25052567
"disk_type",
25062568
"id"
25072569
]
2570+
},
2571+
"ZpoolKind": {
2572+
"type": "string",
2573+
"enum": [
2574+
"external",
2575+
"internal"
2576+
]
2577+
},
2578+
"ZpoolName": {
2579+
"description": "A wrapper around a zpool name.\n\nThis expects that the format will be: `ox{i,p}_<UUID>` - we parse the prefix when reading the structure, and validate that the UUID can be utilized.",
2580+
"type": "object",
2581+
"properties": {
2582+
"id": {
2583+
"type": "string",
2584+
"format": "uuid"
2585+
},
2586+
"kind": {
2587+
"$ref": "#/components/schemas/ZpoolKind"
2588+
}
2589+
},
2590+
"required": [
2591+
"id",
2592+
"kind"
2593+
]
25082594
}
25092595
}
25102596
}

sled-agent/src/bootstrap/hardware.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -178,10 +178,7 @@ impl HardwareMonitor {
178178
let hardware = HardwareManager::new(log, sled_mode)
179179
.map_err(|e| Error::Hardware(e))?;
180180

181-
// TODO: The coupling between the storage and service manager is growing
182-
// pretty tight; we should consider merging them together.
183-
let storage_manager =
184-
StorageManager::new(&log, underlay_etherstub.clone()).await;
181+
let storage_manager = StorageManager::new(&log).await;
185182

186183
let service_manager = ServiceManager::new(
187184
log.clone(),

sled-agent/src/http_entrypoints.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ async fn filesystem_put(
103103
let sa = rqctx.context();
104104
let body_args = body.into_inner();
105105
sa.filesystem_ensure(
106+
body_args.id,
106107
body_args.zpool_id,
107108
body_args.dataset_kind,
108109
body_args.address,

sled-agent/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ mod services;
3131
mod sled_agent;
3232
mod smf_helper;
3333
pub mod sp;
34+
pub(crate) mod storage;
3435
mod storage_manager;
3536
mod updates;
3637

0 commit comments

Comments
 (0)