Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a1d7ef4
Updated the command line params flow from test
gurasinghMS Nov 19, 2025
408e001
More updates to get the test running
gurasinghMS Nov 19, 2025
2e22fd8
Got the test to workg
gurasinghMS Nov 20, 2025
75187cf
Going to open this PR for review soon, removing the unneeded changes
gurasinghMS Nov 21, 2025
725104b
Added a comment for UpdateCli Params
gurasinghMS Nov 21, 2025
a0557fc
Fixing based on copilot comments
gurasinghMS Nov 21, 2025
b783e20
Added NvmeKeepaliveConfig
gurasinghMS Nov 21, 2025
2c029d3
jotting down some ideas
gurasinghMS Nov 21, 2025
dd3497e
build the nvme keepalive configuration correctly to reflect changes t…
gurasinghMS Nov 21, 2025
3de5082
Build should now be working
gurasinghMS Nov 21, 2025
aeb70c0
Most recent changes
gurasinghMS Nov 21, 2025
db59f94
Merge branch 'main' into new-keepalive-flag
gurasinghMS Nov 21, 2025
f63d23b
Fixed keep alive enum name
gurasinghMS Nov 22, 2025
4792d7a
Test now working as expected
gurasinghMS Nov 22, 2025
4705d32
Update openhcl/underhill_core/src/options.rs
gurasinghMS Nov 25, 2025
349d45c
Merge branch 'main' into new-keepalive-flag
gurasinghMS Nov 25, 2025
d398ff1
Pulling in changes
gurasinghMS Nov 27, 2025
d1e5a0f
Merge branch 'main' into new-keepalive-flag
gurasinghMS Nov 27, 2025
820d6f3
Merge branch 'main' into new-keepalive-flag
gurasinghMS Dec 1, 2025
5a25182
Merge branch 'main' into new-keepalive-flag
gurasinghMS Dec 1, 2025
000c01d
Merge branch 'main' into new-keepalive-flag
smalis-msft Dec 1, 2025
bc2e1e3
Merge branch 'main' into new-keepalive-flag
gurasinghMS Dec 1, 2025
55744e2
Merge branch 'main' into new-keepalive-flag
gurasinghMS Dec 1, 2025
260dfbd
Remove changes from netvsp
gurasinghMS Dec 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions openhcl/openhcl_boot/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -289,13 +289,25 @@ fn build_kernel_command_line(
)?;
}

// Only when explicitly supported by Host.
// Generate the NVMe keep alive command line which should look something
// like: OPENHCL_NVME_KEEP_ALIVE=disabled,host,privatepool
// TODO: Move from command line to device tree when stabilized.
if partition_info.nvme_keepalive
&& vtl2_pool_supported
&& !partition_info.boot_options.disable_nvme_keep_alive
{
write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=1 ")?;
write!(cmdline, "OPENHCL_NVME_KEEP_ALIVE=")?;

if partition_info.boot_options.disable_nvme_keep_alive {
write!(cmdline, "disabled,")?;
}

if partition_info.nvme_keepalive {
write!(cmdline, "host,")?;
} else {
write!(cmdline, "nohost,")?;
}

if vtl2_pool_supported {
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
Comment on lines +298 to +310
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When disable_nvme_keep_alive is false, the command line will not include the "disabled," prefix. However, when it's true, the pattern becomes "disabled,host,privatepool" or "disabled,nohost,noprivatepool", etc.

The FromStr implementation at line 97 handles strings starting with "disabled," but doesn't have explicit matches for patterns like "disabled,host,privatepool". While the catch-all at line 97 will map these to Disabled, this means the host and privatepool information is ignored even though it's being generated.

Consider either:

  1. Not generating the host/privatepool parts when disabled is true (since they're ignored)
  2. Or adding explicit parsing for these patterns if the information is meant to be preserved for logging/debugging
Suggested change
write!(cmdline, "disabled,")?;
}
if partition_info.nvme_keepalive {
write!(cmdline, "host,")?;
} else {
write!(cmdline, "nohost,")?;
}
if vtl2_pool_supported {
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
write!(cmdline, "disabled ")?;
} else {
if partition_info.nvme_keepalive {
write!(cmdline, "host,")?;
} else {
write!(cmdline, "nohost,")?;
}
if vtl2_pool_supported {
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
}

Copilot uses AI. Check for mistakes.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ignoring the host and private pool information when disable has explicitly been passed in should be fine because that's exactly what the "disable" is. it is an override for everything else. So I don't think this needs to be changed.

}
Comment on lines +308 to 311
Copy link

Copilot AI Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The space after "privatepool" and "noprivatepool" will become part of the environment variable value, causing parsing to fail. The FromStr implementation for KeepAliveConfig (line 93 in options.rs) does not trim whitespace before parsing.

Move the space outside the environment variable value:

if vtl2_pool_supported {
    write!(cmdline, "privatepool")?;
} else {
    write!(cmdline, "noprivatepool")?;
}
write!(cmdline, " ")?;
Suggested change
write!(cmdline, "privatepool ")?;
} else {
write!(cmdline, "noprivatepool ")?;
}
write!(cmdline, "privatepool")?;
} else {
write!(cmdline, "noprivatepool")?;
}
write!(cmdline, " ")?;

Copilot uses AI. Check for mistakes.

if let Some(sidecar) = sidecar {
Expand Down
73 changes: 48 additions & 25 deletions openhcl/underhill_core/src/dispatch/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ pub(crate) struct LoadedVm {

pub _periodic_telemetry_task: Task<()>,

pub nvme_keep_alive: bool,
pub nvme_keep_alive: KeepAliveConfig,
pub mana_keep_alive: KeepAliveConfig,
pub test_configuration: Option<TestScenarioConfig>,
pub dma_manager: OpenhclDmaManager,
Expand Down Expand Up @@ -306,7 +306,10 @@ impl LoadedVm {
WorkerRpc::Restart(rpc) => {
let state = async {
let running = self.stop().await;
match self.save(None, false, KeepAliveConfig::Disabled).await {
match self
.save(None, KeepAliveConfig::Disabled, KeepAliveConfig::Disabled)
.await
{
Ok(servicing_state) => Some((rpc, servicing_state)),
Err(err) => {
if running {
Expand Down Expand Up @@ -372,7 +375,9 @@ impl LoadedVm {
UhVmRpc::Save(rpc) => {
rpc.handle_failable(async |()| {
let running = self.stop().await;
let r = self.save(None, false, KeepAliveConfig::Disabled).await;
let r = self
.save(None, KeepAliveConfig::Disabled, KeepAliveConfig::Disabled)
.await;
if running {
self.start(None).await;
}
Expand Down Expand Up @@ -577,7 +582,10 @@ impl LoadedVm {

// NOTE: This is set via the corresponding env arg, as this feature is
// experimental.
let nvme_keepalive = self.nvme_keep_alive && capabilities_flags.enable_nvme_keepalive();
if !capabilities_flags.enable_nvme_keepalive() {
self.nvme_keep_alive = KeepAliveConfig::Disabled
};

if !capabilities_flags.enable_mana_keepalive() {
self.mana_keep_alive = KeepAliveConfig::Disabled
};
Expand All @@ -595,15 +603,23 @@ impl LoadedVm {
anyhow::bail!("cannot service underhill while paused");
}

let mut state = self.save(Some(deadline), nvme_keepalive, self.mana_keep_alive.clone()).await?;
let mut state = self
.save(
Some(deadline),
self.nvme_keep_alive.clone(),
self.mana_keep_alive.clone(),
)
.await?;
state.init_state.correlation_id = Some(correlation_id);

// Unload any network devices.
let shutdown_mana = async {
if let Some(network_settings) = self.network_settings.as_mut() {
network_settings
.unload_for_servicing()
.instrument(tracing::info_span!("shutdown_mana", CVM_ALLOWED, %correlation_id))
.instrument(
tracing::info_span!("shutdown_mana", CVM_ALLOWED, %correlation_id),
)
.await;
}
};
Expand All @@ -612,8 +628,13 @@ impl LoadedVm {
let shutdown_nvme = async {
if let Some(nvme_manager) = self.nvme_manager.take() {
nvme_manager
.shutdown(nvme_keepalive)
.instrument(tracing::info_span!("shutdown_nvme_vfio", CVM_ALLOWED, %correlation_id, %nvme_keepalive))
.shutdown(self.nvme_keep_alive.is_enabled())
.instrument(tracing::info_span!(
"shutdown_nvme_vfio",
CVM_ALLOWED,
correlation_id = %correlation_id,
nvme_keep_alive_enabled = self.nvme_keep_alive.is_enabled()
))
.await;
}
};
Expand All @@ -622,18 +643,19 @@ impl LoadedVm {
// restart.
let shutdown_pci = async {
pci_shutdown::shutdown_pci_devices()
.instrument(tracing::info_span!("shutdown_pci_devices", CVM_ALLOWED, %correlation_id))
.instrument(
tracing::info_span!("shutdown_pci_devices", CVM_ALLOWED, %correlation_id),
)
.await
};

// Save the persisted state used by the next openhcl_boot.
let cpus_with_mapped_interrupts = match state
.init_state
.nvme_state
.as_ref() {
Some(nvme_state) => crate::nvme_manager::save_restore::cpus_with_interrupts(&nvme_state.nvme_state),
None => vec![],
};
let cpus_with_mapped_interrupts = match state.init_state.nvme_state.as_ref() {
Some(nvme_state) => {
crate::nvme_manager::save_restore::cpus_with_interrupts(&nvme_state.nvme_state)
}
None => vec![],
};

crate::loader::vtl2_config::write_persisted_info(
self.runtime_params.parsed_openhcl_boot(),
Expand Down Expand Up @@ -772,7 +794,7 @@ impl LoadedVm {
async fn save(
&mut self,
_deadline: Option<std::time::Instant>,
nvme_keepalive_flag: bool,
nvme_keepalive_mode: KeepAliveConfig,
mana_keepalive_mode: KeepAliveConfig,
) -> anyhow::Result<ServicingState> {
assert!(!self.state_units.is_running());
Expand All @@ -785,23 +807,24 @@ impl LoadedVm {
//
// This has to happen before saving the network state, otherwise its allocations
// are marked as Free and are unable to be restored.
let dma_manager_state = if nvme_keepalive_flag || mana_keepalive_mode.is_enabled() {
use vmcore::save_restore::SaveRestore;
Some(self.dma_manager.save().context("dma_manager save failed")?)
} else {
None
};
let dma_manager_state =
if nvme_keepalive_mode.is_enabled() || mana_keepalive_mode.is_enabled() {
use vmcore::save_restore::SaveRestore;
Some(self.dma_manager.save().context("dma_manager save failed")?)
} else {
None
};

// Only save NVMe state when there are NVMe controllers and keep alive
// was enabled.
let nvme_state = if let Some(n) = &self.nvme_manager {
// DEVNOTE: A subtlety here is that the act of saving the NVMe state also causes the driver
// to enter a state where subsequent teardown operations will noop. There is a STRONG
// correlation between save/restore and keepalive.
n.save(nvme_keepalive_flag)
n.save(nvme_keepalive_mode.is_enabled())
.instrument(tracing::info_span!(
"nvme_manager_save",
nvme_keepalive_flag,
nvme_keepalive_mode = %nvme_keepalive_mode.is_enabled(),
CVM_ALLOWED
))
.await
Expand Down
30 changes: 26 additions & 4 deletions openhcl/underhill_core/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ impl FromStr for KeepAliveConfig {
"host,privatepool" => Ok(KeepAliveConfig::EnabledHostAndPrivatePoolPresent),
"nohost,privatepool" => Ok(KeepAliveConfig::DisabledHostAndPrivatePoolPresent),
"nohost,noprivatepool" => Ok(KeepAliveConfig::Disabled),
x if x.starts_with("disabled,") => Ok(KeepAliveConfig::Disabled),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we should add this disabled,.. variant. Instead, I think we should add a host,noprivatepool variant to the parsing that maps to a new enum variant KeepAliveConfig::EnabledHostAndPrivatePoolMissing that maps to the scenario where the host supports keepalive but it's been explicitly disabled.

_ => Err(anyhow::anyhow!("Invalid keepalive config: {}", s)),
}
}
Expand Down Expand Up @@ -207,15 +208,23 @@ pub struct Options {
/// hit exits.
pub no_sidecar_hotplug: bool,

/// (OPENHCL_NVME_KEEP_ALIVE=1) Enable nvme keep alive when servicing.
pub nvme_keep_alive: bool,
/// (OPENHCL_NVME_KEEP_ALIVE=\<KeepaliveConfig\>)
/// Configure NVMe keep alive behavior when servicing.
/// Options are:
/// - "host,privatepool" - Enable keep alive if both host and private pool support it.
/// - "nohost,privatepool" - Used when the host does not support keepalive, but a private pool is present. Keepalive is disabled.
/// - "nohost,noprivatepool" - Keepalive is disabled.
/// - "disabled, X, X" - Keepalive is disabled due to manual
/// override. Host and private pool options are ignored.
pub nvme_keep_alive: KeepAliveConfig,

/// (OPENHCL_MANA_KEEP_ALIVE=\<KeepAliveConfig\>)
/// Configure MANA keep alive behavior when servicing.
/// Options are:
/// - "host,privatepool" - Enable keep alive if both host and private pool support it.
/// - "nohost,privatepool" - Used when the host does not support keepalive, but a private pool is present. Keepalive is disabled.
/// - "nohost,noprivatepool" - Keepalive is disabled.
/// - "disabled, X, X" - TODO: This needs to be implemented for mana.
pub mana_keep_alive: KeepAliveConfig,

/// (OPENHCL_NVME_ALWAYS_FLR=1)
Expand Down Expand Up @@ -366,15 +375,28 @@ impl Options {
let no_sidecar_hotplug = parse_legacy_env_bool("OPENHCL_NO_SIDECAR_HOTPLUG");
let gdbstub = parse_legacy_env_bool("OPENHCL_GDBSTUB");
let gdbstub_port = parse_legacy_env_number("OPENHCL_GDBSTUB_PORT")?.map(|x| x as u32);
let nvme_keep_alive = parse_env_bool("OPENHCL_NVME_KEEP_ALIVE");
let nvme_keep_alive = read_env("OPENHCL_NVME_KEEP_ALIVE")
.map(|x| {
let s = x.to_string_lossy();
match s.parse::<KeepAliveConfig>() {
Ok(v) => v,
Err(e) => {
tracing::warn!(
"failed to parse OPENHCL_NVME_KEEP_ALIVE ('{s}'): {e}. Nvme keepalive will be disabled."
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NOTE: Decided to keep this here as opposed to defaulting to disabled in parse. That way we can more accurately log which variable parsing is failing (nvme vs mana)

);
KeepAliveConfig::Disabled
}
}
})
.unwrap_or(KeepAliveConfig::Disabled);
let mana_keep_alive = read_env("OPENHCL_MANA_KEEP_ALIVE")
.map(|x| {
let s = x.to_string_lossy();
match s.parse::<KeepAliveConfig>() {
Ok(v) => v,
Err(e) => {
tracing::warn!(
"failed to parse OPENHCL_MANA_KEEP_ALIVE ('{s}'): {e}. keepalive will be disabled."
"failed to parse OPENHCL_MANA_KEEP_ALIVE ('{s}'): {e}. Mana keepalive will be disabled."
);
KeepAliveConfig::Disabled
}
Expand Down
4 changes: 2 additions & 2 deletions openhcl/underhill_core/src/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ pub struct UnderhillEnvCfg {
/// Hide the isolation mode from the guest.
pub hide_isolation: bool,
/// Enable nvme keep alive.
pub nvme_keep_alive: bool,
pub nvme_keep_alive: KeepAliveConfig,
/// Enable mana keep alive.
pub mana_keep_alive: KeepAliveConfig,
/// Don't skip FLR for NVMe devices.
Expand Down Expand Up @@ -2174,7 +2174,7 @@ async fn new_underhill_vm(
// TODO: reevaluate enablement of nvme save restore when private pool
// save restore to bootshim is available.
let private_pool_available = !runtime_params.private_pool_ranges().is_empty();
let save_restore_supported = env_cfg.nvme_keep_alive && private_pool_available;
let save_restore_supported = env_cfg.nvme_keep_alive.is_enabled() && private_pool_available;

let manager = NvmeManager::new(
&driver_source,
Expand Down
Loading