Skip to content

Commit cb3ed3c

Browse files
authored
gdb: fix sandbox function cancellation when gdb enabled (#621)
- when the `gdb` features is enabled, the EINTR exit reason was always mapped to a Debug stop reason. With the new changes to the threading model, the EINTR exit reason can be a result of a cancel request. We need to first check if a cancel was requested, if not, it means a sandbox with debugging enabled was issued an interrupt request from the debugger - adds a way for a sandbox under debugging to remain attached to the debugger when a crash happens. This way the crash can be inspected. This forbids continuing execution and writing to memory and registers. Signed-off-by: Doru Blânzeanu <dblnz@pm.me>
1 parent e280a60 commit cb3ed3c

File tree

7 files changed

+347
-58
lines changed

7 files changed

+347
-58
lines changed

docs/how-to-debug-a-hyperlight-guest.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ The Hyperlight `gdb` feature enables **KVM** and **MSHV** guest debugging to:
1414
- read and write addresses
1515
- step/continue
1616
- get code offset from target
17+
- stop when a crash occurs and only allow read access to the guest memory and registers
1718

1819
## Expected behavior
1920

@@ -32,6 +33,10 @@ session of a guest binary running inside a Hyperlight sandbox on Linux.
3233
- if two sandbox instances are created with the same debug port, the second
3334
instance logs an error and the gdb thread will not be created, but the sandbox
3435
will continue to run without gdb debugging
36+
- when a crash happens, the debugger session remains active, and the guest
37+
vCPU is stopped, allowing the gdb client to inspect the state of the guest.
38+
The debug target will refuse any resume, step actions and write operations to
39+
the guest memory and registers until the gdb client disconnects or the sandbox is stopped.
3540

3641
## Example
3742

src/hyperlight_host/src/hypervisor/gdb/event_loop.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ impl run_blocking::BlockingEventLoop for GdbBlockingEventLoop {
5858
tid: (),
5959
signal: Signal(SIGRTMIN() as u8),
6060
},
61+
VcpuStopReason::Crash => BaseStopReason::SignalWithThread {
62+
tid: (),
63+
signal: Signal(11),
64+
},
6165
VcpuStopReason::Unknown => {
6266
log::warn!("Unknown stop reason received");
6367

src/hyperlight_host/src/hypervisor/gdb/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@ pub(crate) struct X86_64Regs {
108108
/// Defines the possible reasons for which a vCPU can be stopped when debugging
109109
#[derive(Debug)]
110110
pub enum VcpuStopReason {
111+
Crash,
111112
DoneStep,
112113
/// Hardware breakpoint inserted by the hypervisor so the guest can be stopped
113114
/// at the entry point. This is used to avoid the guest from executing
@@ -145,6 +146,7 @@ pub(crate) enum DebugResponse {
145146
DisableDebug,
146147
ErrorOccurred,
147148
GetCodeSectionOffset(u64),
149+
NotAllowed,
148150
ReadAddr(Vec<u8>),
149151
ReadRegisters(X86_64Regs),
150152
RemoveHwBreakpoint(bool),

src/hyperlight_host/src/hypervisor/gdb/x86_64_target.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,12 @@ impl HyperlightSandboxTarget {
7777

7878
match self.send_command(DebugMsg::Continue)? {
7979
DebugResponse::Continue => Ok(()),
80+
DebugResponse::NotAllowed => {
81+
log::error!("Action not allowed at this time, crash might have occurred");
82+
// This is a consequence of the target crashing or being in an invalid state
83+
// we cannot continue execution, but we can still read registers and memory
84+
Ok(())
85+
}
8086
msg => {
8187
log::error!("Unexpected message received: {:?}", msg);
8288
Err(GdbTargetError::UnexpectedMessage)
@@ -164,6 +170,12 @@ impl SingleThreadBase for HyperlightSandboxTarget {
164170

165171
match self.send_command(DebugMsg::WriteAddr(gva, v))? {
166172
DebugResponse::WriteAddr => Ok(()),
173+
DebugResponse::NotAllowed => {
174+
log::error!("Action not allowed at this time, crash might have occurred");
175+
// This is a consequence of the target crashing or being in an invalid state
176+
// we cannot continue execution, but we can still read registers and memory
177+
Ok(())
178+
}
167179
DebugResponse::ErrorOccurred => {
168180
log::error!("Error occurred");
169181
Err(TargetError::NonFatal)
@@ -245,6 +257,12 @@ impl SingleThreadBase for HyperlightSandboxTarget {
245257

246258
match self.send_command(DebugMsg::WriteRegisters(regs))? {
247259
DebugResponse::WriteRegisters => Ok(()),
260+
DebugResponse::NotAllowed => {
261+
log::error!("Action not allowed at this time, crash might have occurred");
262+
// This is a consequence of the target crashing or being in an invalid state
263+
// we cannot continue execution, but we can still read registers and memory
264+
Ok(())
265+
}
248266
DebugResponse::ErrorOccurred => {
249267
log::error!("Error occurred");
250268
Err(TargetError::NonFatal)
@@ -301,6 +319,12 @@ impl HwBreakpoint for HyperlightSandboxTarget {
301319

302320
match self.send_command(DebugMsg::AddHwBreakpoint(addr))? {
303321
DebugResponse::AddHwBreakpoint(rsp) => Ok(rsp),
322+
DebugResponse::NotAllowed => {
323+
log::error!("Action not allowed at this time, crash might have occurred");
324+
// This is a consequence of the target crashing or being in an invalid state
325+
// we cannot continue execution, but we can still read registers and memory
326+
Err(TargetError::NonFatal)
327+
}
304328
DebugResponse::ErrorOccurred => {
305329
log::error!("Error occurred");
306330
Err(TargetError::NonFatal)
@@ -321,6 +345,12 @@ impl HwBreakpoint for HyperlightSandboxTarget {
321345

322346
match self.send_command(DebugMsg::RemoveHwBreakpoint(addr))? {
323347
DebugResponse::RemoveHwBreakpoint(rsp) => Ok(rsp),
348+
DebugResponse::NotAllowed => {
349+
log::error!("Action not allowed at this time, crash might have occurred");
350+
// This is a consequence of the target crashing or being in an invalid state
351+
// we cannot continue execution, but we can still read registers and memory
352+
Err(TargetError::NonFatal)
353+
}
324354
DebugResponse::ErrorOccurred => {
325355
log::error!("Error occurred");
326356
Err(TargetError::NonFatal)
@@ -343,6 +373,12 @@ impl SwBreakpoint for HyperlightSandboxTarget {
343373

344374
match self.send_command(DebugMsg::AddSwBreakpoint(addr))? {
345375
DebugResponse::AddSwBreakpoint(rsp) => Ok(rsp),
376+
DebugResponse::NotAllowed => {
377+
log::error!("Action not allowed at this time, crash might have occurred");
378+
// This is a consequence of the target crashing or being in an invalid state
379+
// we cannot continue execution, but we can still read registers and memory
380+
Err(TargetError::NonFatal)
381+
}
346382
DebugResponse::ErrorOccurred => {
347383
log::error!("Error occurred");
348384
Err(TargetError::NonFatal)
@@ -363,6 +399,12 @@ impl SwBreakpoint for HyperlightSandboxTarget {
363399

364400
match self.send_command(DebugMsg::RemoveSwBreakpoint(addr))? {
365401
DebugResponse::RemoveSwBreakpoint(rsp) => Ok(rsp),
402+
DebugResponse::NotAllowed => {
403+
log::error!("Action not allowed at this time, crash might have occurred");
404+
// This is a consequence of the target crashing or being in an invalid state
405+
// we cannot continue execution, but we can still read registers and memory
406+
Err(TargetError::NonFatal)
407+
}
366408
DebugResponse::ErrorOccurred => {
367409
log::error!("Error occurred");
368410
Err(TargetError::NonFatal)
@@ -398,6 +440,12 @@ impl SingleThreadSingleStep for HyperlightSandboxTarget {
398440
log::error!("Error occurred");
399441
Err(GdbTargetError::UnexpectedError)
400442
}
443+
DebugResponse::NotAllowed => {
444+
log::error!("Action not allowed at this time, crash might have occurred");
445+
// This is a consequence of the target crashing or being in an invalid state
446+
// we cannot continue execution, but we can still read registers and memory
447+
Ok(())
448+
}
401449
msg => {
402450
log::error!("Unexpected message received: {:?}", msg);
403451
Err(GdbTargetError::UnexpectedMessage)

src/hyperlight_host/src/hypervisor/hyperv_linux.rs

Lines changed: 137 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ use {super::crashdump, std::path::Path};
5555

5656
use super::fpu::{FP_CONTROL_WORD_DEFAULT, FP_TAG_WORD_DEFAULT, MXCSR_DEFAULT};
5757
#[cfg(gdb)]
58-
use super::gdb::{DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, MshvDebug};
58+
use super::gdb::{
59+
DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, MshvDebug, VcpuStopReason,
60+
};
5961
#[cfg(gdb)]
6062
use super::handlers::DbgMemAccessHandlerWrapper;
6163
use super::handlers::{MemAccessHandlerWrapper, OutBHandlerWrapper};
@@ -749,6 +751,25 @@ impl Hypervisor for HypervLinuxDriver {
749751
.store(false, Ordering::Relaxed);
750752
HyperlightExit::Cancelled()
751753
} else {
754+
// In case of the gdb feature, if no cancellation was requested,
755+
// and the debugging is enabled it means the vCPU was stopped because
756+
// of an interrupt coming from the debugger thread
757+
#[cfg(gdb)]
758+
if self.debug.is_some() {
759+
// If the vCPU was stopped because of an interrupt, we need to
760+
// return a special exit reason so that the gdb thread can handle it
761+
// and resume execution
762+
// NOTE: There is a chance that the vCPU was stopped because of a stale
763+
// signal that was meant to be delivered to a previous/other vCPU on this
764+
// same thread, however, we cannot distinguish between the two cases, so
765+
// we assume that the vCPU was stopped because of an interrupt.
766+
// This is fine, because the debugger will be notified about an interrupt
767+
HyperlightExit::Debug(VcpuStopReason::Interrupt)
768+
} else {
769+
HyperlightExit::Retry()
770+
}
771+
772+
#[cfg(not(gdb))]
752773
HyperlightExit::Retry()
753774
}
754775
}
@@ -835,39 +856,130 @@ impl Hypervisor for HypervLinuxDriver {
835856
dbg_mem_access_fn: std::sync::Arc<
836857
std::sync::Mutex<dyn super::handlers::DbgMemAccessHandlerCaller>,
837858
>,
838-
stop_reason: super::gdb::VcpuStopReason,
859+
stop_reason: VcpuStopReason,
839860
) -> Result<()> {
840-
self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
841-
.map_err(|e| new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e))?;
861+
if self.debug.is_none() {
862+
return Err(new_error!("Debugging is not enabled"));
863+
}
842864

843-
loop {
844-
log::debug!("Debug wait for event to resume vCPU");
865+
match stop_reason {
866+
// If the vCPU stopped because of a crash, we need to handle it differently
867+
// We do not want to allow resuming execution or placing breakpoints
868+
// because the guest has crashed.
869+
// We only allow reading registers and memory
870+
VcpuStopReason::Crash => {
871+
self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
872+
.map_err(|e| {
873+
new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
874+
})?;
875+
876+
loop {
877+
log::debug!("Debug wait for event to resume vCPU");
878+
// Wait for a message from gdb
879+
let req = self.recv_dbg_msg()?;
880+
881+
// Flag to store if we should deny continue or step requests
882+
let mut deny_continue = false;
883+
// Flag to store if we should detach from the gdb session
884+
let mut detach = false;
885+
886+
let response = match req {
887+
// Allow the detach request to disable debugging by continuing resuming
888+
// hypervisor crash error reporting
889+
DebugMsg::DisableDebug => {
890+
detach = true;
891+
DebugResponse::DisableDebug
892+
}
893+
// Do not allow continue or step requests
894+
DebugMsg::Continue | DebugMsg::Step => {
895+
deny_continue = true;
896+
DebugResponse::NotAllowed
897+
}
898+
// Do not allow adding/removing breakpoints and writing to memory or registers
899+
DebugMsg::AddHwBreakpoint(_)
900+
| DebugMsg::AddSwBreakpoint(_)
901+
| DebugMsg::RemoveHwBreakpoint(_)
902+
| DebugMsg::RemoveSwBreakpoint(_)
903+
| DebugMsg::WriteAddr(_, _)
904+
| DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed,
905+
906+
// For all other requests, we will process them normally
907+
_ => {
908+
let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
909+
match result {
910+
Ok(response) => response,
911+
Err(HyperlightError::TranslateGuestAddress(_)) => {
912+
// Treat non fatal errors separately so the guest doesn't fail
913+
DebugResponse::ErrorOccurred
914+
}
915+
Err(e) => {
916+
log::error!("Error processing debug request: {:?}", e);
917+
return Err(e);
918+
}
919+
}
920+
}
921+
};
845922

846-
// Wait for a message from gdb
847-
let req = self.recv_dbg_msg()?;
923+
// Send the response to the request back to gdb
924+
self.send_dbg_msg(response)
925+
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
848926

849-
let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
927+
// If we are denying continue or step requests, the debugger assumes the
928+
// execution started so we need to report a stop reason as a crash and let
929+
// it request to read registers/memory to figure out what happened
930+
if deny_continue {
931+
self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash))
932+
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
933+
}
850934

851-
let response = match result {
852-
Ok(response) => response,
853-
// Treat non fatal errors separately so the guest doesn't fail
854-
Err(HyperlightError::TranslateGuestAddress(_)) => DebugResponse::ErrorOccurred,
855-
Err(e) => {
856-
return Err(e);
935+
// If we are detaching, we will break the loop and the Hypervisor will continue
936+
// to handle the Crash reason
937+
if detach {
938+
break;
939+
}
857940
}
858-
};
941+
}
942+
// If the vCPU stopped because of any other reason except a crash, we can handle it
943+
// normally
944+
_ => {
945+
// Send the stop reason to the gdb thread
946+
self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
947+
.map_err(|e| {
948+
new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
949+
})?;
950+
951+
loop {
952+
log::debug!("Debug wait for event to resume vCPU");
953+
// Wait for a message from gdb
954+
let req = self.recv_dbg_msg()?;
955+
956+
let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
957+
958+
let response = match result {
959+
Ok(response) => response,
960+
// Treat non fatal errors separately so the guest doesn't fail
961+
Err(HyperlightError::TranslateGuestAddress(_)) => {
962+
DebugResponse::ErrorOccurred
963+
}
964+
Err(e) => {
965+
return Err(e);
966+
}
967+
};
859968

860-
// If the command was either step or continue, we need to run the vcpu
861-
let cont = matches!(
862-
response,
863-
DebugResponse::Step | DebugResponse::Continue | DebugResponse::DisableDebug
864-
);
969+
let cont = matches!(
970+
response,
971+
DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug
972+
);
865973

866-
self.send_dbg_msg(response)
867-
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
974+
self.send_dbg_msg(response)
975+
.map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
868976

869-
if cont {
870-
break;
977+
// Check if we should continue execution
978+
// We continue if the response is one of the following: Step, Continue, or DisableDebug
979+
if cont {
980+
break;
981+
}
982+
}
871983
}
872984
}
873985

0 commit comments

Comments
 (0)