Skip to content

gdb: fix sandbox function cancellation when gdb enabled #621

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/how-to-debug-a-hyperlight-guest.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The Hyperlight `gdb` feature enables **KVM** and **MSHV** guest debugging to:
- read and write addresses
- step/continue
- get code offset from target
- stop when a crash occurs, then allow only read access to the guest memory and registers

## Expected behavior

Expand All @@ -32,6 +33,10 @@ session of a guest binary running inside a Hyperlight sandbox on Linux.
- if two sandbox instances are created with the same debug port, the second
instance logs an error and the gdb thread will not be created, but the sandbox
will continue to run without gdb debugging
- when a crash happens, the debugger session remains active and the guest
vCPU is stopped, allowing the gdb client to inspect the state of the guest.
The debug target will refuse any resume or step actions, as well as any write operations to
the guest memory and registers, until the gdb client disconnects or the sandbox is stopped.

## Example

Expand Down
4 changes: 4 additions & 0 deletions src/hyperlight_host/src/hypervisor/gdb/event_loop.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ impl run_blocking::BlockingEventLoop for GdbBlockingEventLoop {
tid: (),
signal: Signal(SIGRTMIN() as u8),
},
VcpuStopReason::Crash => BaseStopReason::SignalWithThread {
tid: (),
signal: Signal(11),
},
VcpuStopReason::Unknown => {
log::warn!("Unknown stop reason received");

Expand Down
2 changes: 2 additions & 0 deletions src/hyperlight_host/src/hypervisor/gdb/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ pub(crate) struct X86_64Regs {
/// Defines the possible reasons for which a vCPU can be stopped when debugging
#[derive(Debug)]
pub enum VcpuStopReason {
Crash,
DoneStep,
/// Hardware breakpoint inserted by the hypervisor so the guest can be stopped
/// at the entry point. This is used to avoid the guest from executing
Expand Down Expand Up @@ -145,6 +146,7 @@ pub(crate) enum DebugResponse {
DisableDebug,
ErrorOccurred,
GetCodeSectionOffset(u64),
NotAllowed,
ReadAddr(Vec<u8>),
ReadRegisters(X86_64Regs),
RemoveHwBreakpoint(bool),
Expand Down
48 changes: 48 additions & 0 deletions src/hyperlight_host/src/hypervisor/gdb/x86_64_target.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,12 @@ impl HyperlightSandboxTarget {

match self.send_command(DebugMsg::Continue)? {
DebugResponse::Continue => Ok(()),
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Ok(())
}
msg => {
log::error!("Unexpected message received: {:?}", msg);
Err(GdbTargetError::UnexpectedMessage)
Expand Down Expand Up @@ -164,6 +170,12 @@ impl SingleThreadBase for HyperlightSandboxTarget {

match self.send_command(DebugMsg::WriteAddr(gva, v))? {
DebugResponse::WriteAddr => Ok(()),
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Ok(())
}
DebugResponse::ErrorOccurred => {
log::error!("Error occurred");
Err(TargetError::NonFatal)
Expand Down Expand Up @@ -245,6 +257,12 @@ impl SingleThreadBase for HyperlightSandboxTarget {

match self.send_command(DebugMsg::WriteRegisters(regs))? {
DebugResponse::WriteRegisters => Ok(()),
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Ok(())
}
DebugResponse::ErrorOccurred => {
log::error!("Error occurred");
Err(TargetError::NonFatal)
Expand Down Expand Up @@ -301,6 +319,12 @@ impl HwBreakpoint for HyperlightSandboxTarget {

match self.send_command(DebugMsg::AddHwBreakpoint(addr))? {
DebugResponse::AddHwBreakpoint(rsp) => Ok(rsp),
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Err(TargetError::NonFatal)
}
DebugResponse::ErrorOccurred => {
log::error!("Error occurred");
Err(TargetError::NonFatal)
Expand All @@ -321,6 +345,12 @@ impl HwBreakpoint for HyperlightSandboxTarget {

match self.send_command(DebugMsg::RemoveHwBreakpoint(addr))? {
DebugResponse::RemoveHwBreakpoint(rsp) => Ok(rsp),
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Err(TargetError::NonFatal)
}
DebugResponse::ErrorOccurred => {
log::error!("Error occurred");
Err(TargetError::NonFatal)
Expand All @@ -343,6 +373,12 @@ impl SwBreakpoint for HyperlightSandboxTarget {

match self.send_command(DebugMsg::AddSwBreakpoint(addr))? {
DebugResponse::AddSwBreakpoint(rsp) => Ok(rsp),
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Err(TargetError::NonFatal)
}
DebugResponse::ErrorOccurred => {
log::error!("Error occurred");
Err(TargetError::NonFatal)
Expand All @@ -363,6 +399,12 @@ impl SwBreakpoint for HyperlightSandboxTarget {

match self.send_command(DebugMsg::RemoveSwBreakpoint(addr))? {
DebugResponse::RemoveSwBreakpoint(rsp) => Ok(rsp),
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Err(TargetError::NonFatal)
}
DebugResponse::ErrorOccurred => {
log::error!("Error occurred");
Err(TargetError::NonFatal)
Expand Down Expand Up @@ -398,6 +440,12 @@ impl SingleThreadSingleStep for HyperlightSandboxTarget {
log::error!("Error occurred");
Err(GdbTargetError::UnexpectedError)
}
DebugResponse::NotAllowed => {
log::error!("Action not allowed at this time, crash might have occurred");
// This is a consequence of the target crashing or being in an invalid state
// we cannot continue execution, but we can still read registers and memory
Ok(())
}
msg => {
log::error!("Unexpected message received: {:?}", msg);
Err(GdbTargetError::UnexpectedMessage)
Expand Down
162 changes: 137 additions & 25 deletions src/hyperlight_host/src/hypervisor/hyperv_linux.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ use {super::crashdump, std::path::Path};

use super::fpu::{FP_CONTROL_WORD_DEFAULT, FP_TAG_WORD_DEFAULT, MXCSR_DEFAULT};
#[cfg(gdb)]
use super::gdb::{DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, MshvDebug};
use super::gdb::{
DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, MshvDebug, VcpuStopReason,
};
#[cfg(gdb)]
use super::handlers::DbgMemAccessHandlerWrapper;
use super::handlers::{MemAccessHandlerWrapper, OutBHandlerWrapper};
Expand Down Expand Up @@ -749,6 +751,25 @@ impl Hypervisor for HypervLinuxDriver {
.store(false, Ordering::Relaxed);
HyperlightExit::Cancelled()
} else {
// In case of the gdb feature, if no cancellation was requested,
// and the debugging is enabled it means the vCPU was stopped because
// of an interrupt coming from the debugger thread
#[cfg(gdb)]
if self.debug.is_some() {
// If the vCPU was stopped because of an interrupt, we need to
// return a special exit reason so that the gdb thread can handle it
// and resume execution
// NOTE: There is a chance that the vCPU was stopped because of a stale
// signal that was meant to be delivered to a previous/other vCPU on this
// same thread, however, we cannot distinguish between the two cases, so
// we assume that the vCPU was stopped because of an interrupt.
// This is fine, because the debugger will be notified about an interrupt
HyperlightExit::Debug(VcpuStopReason::Interrupt)
} else {
HyperlightExit::Retry()
}

#[cfg(not(gdb))]
HyperlightExit::Retry()
}
}
Expand Down Expand Up @@ -835,39 +856,130 @@ impl Hypervisor for HypervLinuxDriver {
dbg_mem_access_fn: std::sync::Arc<
std::sync::Mutex<dyn super::handlers::DbgMemAccessHandlerCaller>,
>,
stop_reason: super::gdb::VcpuStopReason,
stop_reason: VcpuStopReason,
) -> Result<()> {
self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
.map_err(|e| new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e))?;
if self.debug.is_none() {
return Err(new_error!("Debugging is not enabled"));
}

loop {
log::debug!("Debug wait for event to resume vCPU");
match stop_reason {
// If the vCPU stopped because of a crash, we need to handle it differently
// We do not want to allow resuming execution or placing breakpoints
// because the guest has crashed.
// We only allow reading registers and memory
VcpuStopReason::Crash => {
self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
.map_err(|e| {
new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
})?;

loop {
log::debug!("Debug wait for event to resume vCPU");
// Wait for a message from gdb
let req = self.recv_dbg_msg()?;

// Flag to store if we should deny continue or step requests
let mut deny_continue = false;
// Flag to store if we should detach from the gdb session
let mut detach = false;

let response = match req {
// Allow the detach request to disable debugging by continuing resuming
// hypervisor crash error reporting
DebugMsg::DisableDebug => {
detach = true;
DebugResponse::DisableDebug
}
// Do not allow continue or step requests
DebugMsg::Continue | DebugMsg::Step => {
deny_continue = true;
DebugResponse::NotAllowed
}
// Do not allow adding/removing breakpoints and writing to memory or registers
DebugMsg::AddHwBreakpoint(_)
| DebugMsg::AddSwBreakpoint(_)
| DebugMsg::RemoveHwBreakpoint(_)
| DebugMsg::RemoveSwBreakpoint(_)
| DebugMsg::WriteAddr(_, _)
| DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed,

// For all other requests, we will process them normally
_ => {
let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
match result {
Ok(response) => response,
Err(HyperlightError::TranslateGuestAddress(_)) => {
// Treat non fatal errors separately so the guest doesn't fail
DebugResponse::ErrorOccurred
}
Err(e) => {
log::error!("Error processing debug request: {:?}", e);
return Err(e);
}
}
}
};

// Wait for a message from gdb
let req = self.recv_dbg_msg()?;
// Send the response to the request back to gdb
self.send_dbg_msg(response)
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
// If we are denying continue or step requests, the debugger assumes the
// execution started so we need to report a stop reason as a crash and let
// it request to read registers/memory to figure out what happened
if deny_continue {
self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash))
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
}

let response = match result {
Ok(response) => response,
// Treat non fatal errors separately so the guest doesn't fail
Err(HyperlightError::TranslateGuestAddress(_)) => DebugResponse::ErrorOccurred,
Err(e) => {
return Err(e);
// If we are detaching, we will break the loop and the Hypervisor will continue
// to handle the Crash reason
if detach {
break;
}
}
};
}
// If the vCPU stopped because of any other reason except a crash, we can handle it
// normally
_ => {
// Send the stop reason to the gdb thread
self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
.map_err(|e| {
new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
})?;

loop {
log::debug!("Debug wait for event to resume vCPU");
// Wait for a message from gdb
let req = self.recv_dbg_msg()?;

let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());

let response = match result {
Ok(response) => response,
// Treat non fatal errors separately so the guest doesn't fail
Err(HyperlightError::TranslateGuestAddress(_)) => {
DebugResponse::ErrorOccurred
}
Err(e) => {
return Err(e);
}
};

// If the command was either step or continue, we need to run the vcpu
let cont = matches!(
response,
DebugResponse::Step | DebugResponse::Continue | DebugResponse::DisableDebug
);
let cont = matches!(
response,
DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug
);

self.send_dbg_msg(response)
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
self.send_dbg_msg(response)
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

if cont {
break;
// Check if we should continue execution
// We continue if the response is one of the following: Step, Continue, or DisableDebug
if cont {
break;
}
}
}
}

Expand Down
Loading
Loading