Skip to content

Commit 9b7c249

Browse files
authored
sled-agent: Better handling of internal DNS gz address and DDM advertisements (#7786)
Two changes in this PR to fix two closely-related bugs: * Instead of waiting until after ensuring all zones have been destroyed/started to update the set of internal DNS prefixes we tell DDM to advertise, we now start advertising immediately after creating or deleting the gz address. Fixes #7782. * When shutting down an internal DNS zone, delete the gz address we created when we started it. Fixes #7785.
1 parent 63d0dc7 commit 9b7c249

File tree

3 files changed

+143
-34
lines changed

3 files changed

+143
-34
lines changed

illumos-utils/src/zone.rs

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use slog::Logger;
1212
use slog::info;
1313
use std::net::{IpAddr, Ipv6Addr};
1414

15+
use crate::ExecutionError;
1516
use crate::addrobj::AddrObject;
1617
use crate::dladm::{EtherstubVnic, VNIC_PREFIX_BOOTSTRAP, VNIC_PREFIX_CONTROL};
1718
use crate::zpool::PathInPool;
@@ -687,11 +688,21 @@ impl Zones {
687688
Ok(())
688689
}
689690

691+
/// Delete an address object.
692+
///
693+
/// This method attempts to be idempotent: deleting a nonexistent address
694+
/// object returns `Ok(())`.
690695
#[allow(clippy::needless_lifetimes)]
691696
pub fn delete_address<'a>(
692697
zone: Option<&'a str>,
693698
addrobj: &AddrObject,
694699
) -> Result<(), DeleteAddressError> {
700+
// Expected output on stderr if we try to delete an address that doesn't
701+
// exist. (We look for this error and return `Ok(_)`, making this method
702+
// idempotent.)
703+
const OBJECT_NOT_FOUND: &str =
704+
"could not delete address: Object not found";
705+
695706
let mut command = std::process::Command::new(PFEXEC);
696707
let mut args = vec![];
697708
if let Some(zone) = zone {
@@ -704,12 +715,19 @@ impl Zones {
704715
args.push(addrobj.to_string());
705716

706717
let cmd = command.args(args);
707-
execute(cmd).map_err(|err| DeleteAddressError {
708-
zone: zone.unwrap_or("global").to_string(),
709-
addrobj: addrobj.clone(),
710-
err,
711-
})?;
712-
Ok(())
718+
match execute(cmd) {
719+
Ok(_) => Ok(()),
720+
Err(ExecutionError::CommandFailure(err))
721+
if err.stderr.contains(OBJECT_NOT_FOUND) =>
722+
{
723+
Ok(())
724+
}
725+
Err(err) => Err(DeleteAddressError {
726+
zone: zone.unwrap_or("global").to_string(),
727+
addrobj: addrobj.clone(),
728+
err,
729+
}),
730+
}
713731
}
714732

715733
/// Ensures a link-local IPv6 exists with the name provided in `addrobj`.
@@ -879,4 +897,20 @@ mod tests {
879897
assert_eq!(parsed.prefix(), prefix);
880898
}
881899
}
900+
901+
// This test validates that we correctly detect an attempt to delete an
902+
// address that does not exist and return `Ok(())`.
903+
#[cfg(target_os = "illumos")]
904+
#[test]
905+
fn delete_nonexistent_address() {
906+
// We'll pick a name that hopefully no system actually has...
907+
let addr = AddrObject::new("nonsense", "shouldnotexist").unwrap();
908+
match Zones::delete_address(None, &addr) {
909+
Ok(()) => (),
910+
Err(err) => panic!(
911+
"unexpected error deleting nonexistent address: {}",
912+
slog_error_chain::InlineErrorChain::new(&err)
913+
),
914+
}
915+
}
882916
}

sled-agent/src/ddm_reconciler.rs

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,14 +86,27 @@ impl DdmReconciler {
8686
});
8787
}
8888

89-
pub fn set_internal_dns_subnets(
89+
/// Add an internal DNS subset to the set we should be advertising.
90+
///
91+
/// This method is idempotent.
92+
pub fn add_internal_dns_subnet(
9093
&self,
91-
internal_dns_subnets: BTreeSet<Ipv6Subnet<SLED_PREFIX>>,
94+
internal_dns_subnet: Ipv6Subnet<SLED_PREFIX>,
9295
) {
9396
self.prefixes.send_if_modified(|prefixes| {
94-
let modified = prefixes.internal_dns != internal_dns_subnets;
95-
prefixes.internal_dns = internal_dns_subnets;
96-
modified
97+
prefixes.internal_dns.insert(internal_dns_subnet)
98+
});
99+
}
100+
101+
/// Remove an internal DNS subnet from the set we should be advertising.
102+
///
103+
/// This method is idempotent.
104+
pub fn remove_internal_dns_subnet(
105+
&self,
106+
internal_dns_subnet: Ipv6Subnet<SLED_PREFIX>,
107+
) {
108+
self.prefixes.send_if_modified(|prefixes| {
109+
prefixes.internal_dns.remove(&internal_dns_subnet)
97110
});
98111
}
99112
}

sled-agent/src/services.rs

Lines changed: 85 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,7 @@ use sled_storage::config::MountConfig;
111111
use sled_storage::dataset::{CONFIG_DATASET, INSTALL_DATASET, ZONE_DATASET};
112112
use sled_storage::manager::StorageHandle;
113113
use slog::Logger;
114+
use slog_error_chain::InlineErrorChain;
114115
use std::collections::BTreeMap;
115116
use std::collections::HashSet;
116117
use std::net::{IpAddr, Ipv6Addr, SocketAddr};
@@ -188,13 +189,20 @@ pub enum Error {
188189
#[error("Cannot list zones")]
189190
ZoneList(#[source] illumos_utils::zone::AdmError),
190191

191-
#[error("Cannot remove zone")]
192+
#[error("Cannot remove zone {zone_name}")]
192193
ZoneRemoval {
193194
zone_name: String,
194195
#[source]
195196
err: illumos_utils::zone::AdmError,
196197
},
197198

199+
#[error("Cannot clean up after removal of zone {zone_name}")]
200+
ZoneCleanup {
201+
zone_name: String,
202+
#[source]
203+
err: Box<illumos_utils::zone::DeleteAddressError>,
204+
},
205+
198206
#[error("Failed to boot zone: {0}")]
199207
ZoneBoot(#[from] illumos_utils::running_zone::BootError),
200208

@@ -2334,7 +2342,7 @@ impl ServiceManager {
23342342
// the same machine (which is effectively a
23352343
// developer-only environment -- we wouldn't want this
23362344
// in prod!), they need to be given distinct names.
2337-
let addr_name = format!("internaldns{gz_address_index}");
2345+
let addr_name = internal_dns_addrobj_name(*gz_address_index);
23382346
Zones::ensure_has_global_zone_v6_address(
23392347
self.inner.underlay_vnic.clone(),
23402348
*gz_address,
@@ -2348,6 +2356,17 @@ impl ServiceManager {
23482356
err,
23492357
})?;
23502358

2359+
// Tell DDM to start advertising our address. This may be
2360+
// premature: we haven't yet started the internal DNS server.
2361+
// However, the existence of the `internaldns{gz_address_index}`
2362+
// interface we just created means processes in the gz
2363+
// (including ourselves!) may use that as a _source_ address, so
2364+
// we need to go ahead and advertise this prefix so other sleds
2365+
// know how to route responses back to us. See
2366+
// <https://github.com/oxidecomputer/omicron/issues/7782>.
2367+
self.ddm_reconciler()
2368+
.add_internal_dns_subnet(Ipv6Subnet::new(*gz_address));
2369+
23512370
let internal_dns_config = PropertyGroupBuilder::new("config")
23522371
.add_property(
23532372
"http_address",
@@ -3600,20 +3619,6 @@ impl ServiceManager {
36003619
new_zones.into_iter().map(|zone| (zone.name().to_string(), zone)),
36013620
);
36023621

3603-
// Update our DDM reconciler with the set of all internal DNS global
3604-
// zone addresses we need maghemite to advertise on our behalf.
3605-
self.ddm_reconciler().set_internal_dns_subnets(
3606-
existing_zones
3607-
.values()
3608-
.filter_map(|z| match z.config.zone.zone_type {
3609-
OmicronZoneType::InternalDns { gz_address, .. } => {
3610-
Some(Ipv6Subnet::new(gz_address))
3611-
}
3612-
_ => None,
3613-
})
3614-
.collect(),
3615-
);
3616-
36173622
// If any zones failed to start, exit with an error
36183623
if !errors.is_empty() {
36193624
return Err(Error::ZoneEnsure { errors });
@@ -3659,20 +3664,29 @@ impl ServiceManager {
36593664
log,
36603665
"Failed to take bundle of unexpected zone";
36613666
"zone_name" => &expected_zone_name,
3662-
"reason" => ?e,
3667+
InlineErrorChain::new(&e),
36633668
);
36643669
}
36653670
if let Err(e) = zone.runtime.stop().await {
36663671
error!(log, "Failed to stop zone {}: {e}", zone.name());
36673672
}
3673+
if let Err(e) =
3674+
self.clean_up_after_zone_shutdown(&zone.config.zone).await
3675+
{
3676+
error!(
3677+
log,
3678+
"Failed to clean up after stopping zone {}", zone.name();
3679+
InlineErrorChain::new(&e),
3680+
);
3681+
}
36683682
}
36693683

36703684
// Ensures that if a zone is about to be installed, it does not exist.
36713685
async fn ensure_removed(
36723686
&self,
3673-
zone: &OmicronZoneConfig,
3687+
zone_config: &OmicronZoneConfig,
36743688
) -> Result<(), Error> {
3675-
let zone_name = zone.zone_name();
3689+
let zone_name = zone_config.zone_name();
36763690
match Zones::find(&zone_name).await {
36773691
Ok(Some(zone)) => {
36783692
warn!(
@@ -3695,18 +3709,62 @@ impl ServiceManager {
36953709
self.inner.log,
36963710
"Failed to remove zone";
36973711
"zone" => &zone_name,
3698-
"error" => %e,
3712+
InlineErrorChain::new(&e),
36993713
);
37003714
return Err(Error::ZoneRemoval {
37013715
zone_name: zone_name.to_string(),
37023716
err: e,
37033717
});
37043718
}
3705-
return Ok(());
3719+
if let Err(e) =
3720+
self.clean_up_after_zone_shutdown(zone_config).await
3721+
{
3722+
error!(
3723+
self.inner.log,
3724+
"Failed to clean up after removing zone";
3725+
"zone" => &zone_name,
3726+
InlineErrorChain::new(&e),
3727+
);
3728+
return Err(e);
3729+
}
3730+
Ok(())
37063731
}
3707-
Ok(None) => return Ok(()),
3708-
Err(err) => return Err(Error::ZoneList(err)),
3732+
Ok(None) => Ok(()),
3733+
Err(err) => Err(Error::ZoneList(err)),
3734+
}
3735+
}
3736+
3737+
// Perform any outside-the-zone cleanup required after shutting down a zone.
3738+
async fn clean_up_after_zone_shutdown(
3739+
&self,
3740+
zone: &OmicronZoneConfig,
3741+
) -> Result<(), Error> {
3742+
// Special teardown for internal DNS zones: delete the global zone
3743+
// address we created for it, and tell DDM to stop advertising the
3744+
// prefix of that address.
3745+
if let OmicronZoneType::InternalDns {
3746+
gz_address,
3747+
gz_address_index,
3748+
..
3749+
} = &zone.zone_type
3750+
{
3751+
let addrobj = AddrObject::new(
3752+
&self.inner.underlay_vnic.0,
3753+
&internal_dns_addrobj_name(*gz_address_index),
3754+
)
3755+
.expect("internal DNS address object name is well-formed");
3756+
Zones::delete_address(None, &addrobj).map_err(|err| {
3757+
Error::ZoneCleanup {
3758+
zone_name: zone.zone_name(),
3759+
err: Box::new(err),
3760+
}
3761+
})?;
3762+
3763+
self.ddm_reconciler()
3764+
.remove_internal_dns_subnet(Ipv6Subnet::new(*gz_address));
37093765
}
3766+
3767+
Ok(())
37103768
}
37113769

37123770
// Returns a zone filesystem mountpoint, after ensuring that U.2 storage
@@ -4783,6 +4841,10 @@ impl ServiceManager {
47834841
}
47844842
}
47854843

4844+
fn internal_dns_addrobj_name(gz_address_index: u32) -> String {
4845+
format!("internaldns{gz_address_index}")
4846+
}
4847+
47864848
#[derive(Debug)]
47874849
struct ReconciledNewZonesRequest {
47884850
zones_to_be_removed: HashSet<OmicronZoneConfig>,

0 commit comments

Comments
 (0)