Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions tmt/steps/context/reboot.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ def handle_reboot(self, restart: Optional['RestartContext'] = None) -> bool:
:return: ``True`` when the reboot has taken place, ``False``
otherwise.
"""

if not self.requested:
return False

Expand Down Expand Up @@ -124,6 +123,7 @@ def handle_reboot(self, restart: Optional['RestartContext'] = None) -> bool:
reboot_data = json.loads(self.request_path.read_text())

reboot_command: Optional[ShellScript] = None
systemd_soft_reboot = reboot_data.get('systemd_soft_reboot') == 'true'

if reboot_data.get('command'):
with suppress(TypeError):
Expand All @@ -141,7 +141,14 @@ def handle_reboot(self, restart: Optional['RestartContext'] = None) -> bool:
self.guest.execute(ShellScript(f'rm -f {self.request_path}'))

try:
rebooted = self.guest.reboot(hard=False, command=reboot_command, waiting=waiting)
if systemd_soft_reboot:
rebooted = self.guest.reboot_systemd_soft(
command=reboot_command, waiting=waiting
)
else:
rebooted = self.guest.reboot(
hard=False, command=reboot_command, waiting=waiting
)

except tmt.utils.RunError:
if reboot_command is not None:
Expand All @@ -152,9 +159,23 @@ def handle_reboot(self, restart: Optional['RestartContext'] = None) -> bool:
raise

except tmt.steps.provision.RebootModeNotSupportedError:
self.logger.warning("Guest does not support soft reboot, trying hard reboot.")
if systemd_soft_reboot:
self.logger.warning(
"Guest does not support systemd soft reboot, trying regular soft reboot."
)

rebooted = self.guest.reboot(hard=True, waiting=waiting)
try:
rebooted = self.guest.reboot(
hard=False, command=reboot_command, waiting=waiting
)
except tmt.steps.provision.RebootModeNotSupportedError:
self.logger.warning(
"Guest does not support soft reboot, trying hard reboot."
)
rebooted = self.guest.reboot(hard=True, waiting=waiting)
else:
self.logger.warning("Guest does not support soft reboot, trying hard reboot.")
rebooted = self.guest.reboot(hard=True, waiting=waiting)

if not rebooted:
raise tmt.utils.RebootTimeoutError("Reboot timed out.")
Expand Down
2 changes: 2 additions & 0 deletions tmt/steps/execute/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ class ExecuteInternal(tmt.steps.execute.ExecutePlugin[ExecuteInternalData]):

``tmt-reboot`` - soft reboot the machine from inside the test. After reboot
the execution starts from the test which rebooted the machine.
Use ``tmt-reboot -s`` for systemd soft-reboot which preserves the kernel
and hardware state while restarting userspace only.
An environment variable ``TMT_REBOOT_COUNT`` is provided which
the test can use to handle the reboot. The variable holds the
number of reboots performed by the test. For more information
Expand Down
137 changes: 115 additions & 22 deletions tmt/steps/provision/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,17 @@ class GuestCapability(enum.Enum):
SYSLOG_ACTION_READ_CLEAR = 'syslog-action-read-clear'


class RebootMode(enum.Enum):
"""
Different reboot detection modes
"""

#: Standard reboot detection using boot time from /proc/stat
BOOT_TIME = 'boot_time'
#: Systemd soft reboot detection using boot ID from /proc/sys/kernel/random/boot_id
SYSTEMD_SOFT = 'systemd_soft'


@container
class GuestFacts(SerializableContainer):
"""
Expand Down Expand Up @@ -2072,6 +2083,56 @@ def reboot(

raise NotImplementedError

def reboot_systemd_soft(
self,
command: Optional[Union[Command, ShellScript]] = None,
waiting: Optional[Waiting] = None,
) -> bool:
"""
Perform a systemd soft reboot, and wait for the guest to recover.

Systemd soft reboot preserves the kernel and hardware state while
restarting userspace. The boot ID remains the same after a soft reboot.

:param command: a command to run on the guest to trigger the
soft reboot. Defaults to 'systemctl soft-reboot'.
:param waiting: amount of time in which the guest must become available
again.
:returns: ``True`` if the reboot succeeded, ``False`` otherwise.
"""

waiting = waiting or default_reboot_waiting()

# Default command for systemd soft reboot
if not command:
command = ShellScript('systemctl soft-reboot')

# Check if systemd soft-reboot is supported
try:
check_result = self.execute(
ShellScript('systemctl --help | grep -q "soft-reboot"'), silent=True
)
if check_result.returncode != 0:
raise tmt.steps.provision.RebootModeNotSupportedError(
guest=self,
hard=False,
message="systemctl soft-reboot is not supported on this guest",
)
except Exception as exc:
raise tmt.steps.provision.RebootModeNotSupportedError(
guest=self, hard=False, message="Cannot check for systemctl soft-reboot support"
) from exc

self.debug(f"Triggering systemd soft reboot with command '{command}'.")

# Use generalized perform_reboot with systemd soft reboot mode
return self.perform_reboot(
lambda: self.execute(command, silent=True),
waiting,
mode=RebootMode.SYSTEMD_SOFT,
fetch_reboot_marker=True,
)

def reconnect(
self,
wait: Optional[Waiting] = None,
Expand Down Expand Up @@ -3016,7 +3077,8 @@ def perform_reboot(
self,
action: Callable[[], Any],
wait: Waiting,
fetch_boot_time: bool = True,
mode: RebootMode = RebootMode.BOOT_TIME,
fetch_reboot_marker: bool = True,
) -> bool:
"""
Perform the actual reboot and wait for the guest to recover.
Expand All @@ -3027,15 +3089,11 @@ def perform_reboot(
:py:meth:`perform_reboot` with the right ``action`` callable.

:param action: a callable which will trigger the requested reboot.
:param timeout: amount of time in which the guest must become available
again.
:param tick: how many seconds to wait between two consecutive attempts
of contacting the guest.
:param tick_increase: a multiplier applied to ``tick`` after every
attempt.
:param fetch_boot_time: if set, the current boot time of the
guest would be read first, and used for testing whether the
reboot has been performed. This will require communication
:param wait: waiting configuration for guest recovery.
:param mode: reboot detection mode to use.
:param fetch_reboot_marker: if set, the current reboot marker
(boot time or boot ID) would be read first, and used for testing
whether the reboot has been performed. This will require communication
with the guest, therefore it is recommended to use ``False``
with hard reboot of unhealthy guests.
:returns: ``True`` if the reboot succeeded, ``False`` otherwise.
Expand All @@ -3056,7 +3114,26 @@ def get_boot_time() -> int:

return int(match.group(1))

current_boot_time = get_boot_time() if fetch_boot_time else 0
def get_boot_id() -> str:
"""
Reads boot ID from /proc/sys/kernel/random/boot_id
"""

result = self.execute(ShellScript('cat /proc/sys/kernel/random/boot_id'), silent=True)
return result.stdout.strip() if result.stdout else ""

# Get the current reboot marker based on mode
current_marker: Union[int, str] = 0
if fetch_reboot_marker:
if mode == RebootMode.BOOT_TIME:
current_marker = get_boot_time()
elif mode == RebootMode.SYSTEMD_SOFT:
try:
current_marker = get_boot_id()
except Exception:
self.debug("Could not get boot ID, falling back to boot time detection")
mode = RebootMode.BOOT_TIME
current_marker = get_boot_time()

self.debug(f"Triggering reboot with '{action}'.")

Expand All @@ -3071,25 +3148,41 @@ def get_boot_time() -> int:
else:
raise

# Wait until we get new boot time, connection will drop and will be
# unreachable for some time
def check_boot_time() -> None:
# Wait until reboot marker changes (or stays the same for systemd soft reboot)
def check_reboot_completion() -> None:
try:
new_boot_time = get_boot_time()

if new_boot_time != current_boot_time:
# Different boot time and we are reconnected
return
# Check if we can connect
self.execute(Command('whoami'), silent=True)

# Same boot time, reboot didn't happen yet, retrying
raise tmt.utils.wait.WaitingIncompleteError
# Get new marker based on mode
if mode == RebootMode.BOOT_TIME:
new_marker = get_boot_time()
if new_marker != current_marker:
# Different boot time and we are reconnected
return
# Same boot time, reboot didn't happen yet, retrying
raise tmt.utils.wait.WaitingIncompleteError

if mode == RebootMode.SYSTEMD_SOFT:
new_marker = get_boot_id()
# For systemd soft reboot, boot ID should stay the same
if new_marker == current_marker:
self.debug(
"Systemd soft reboot completed successfully (boot ID unchanged)."
)
return
self.debug(
f"Boot ID changed from {current_marker} to {new_marker}, "
"this might indicate a hard reboot occurred instead."
)
return # Still accept it as the guest is back up
Comment on lines +3166 to +3178
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Critical logic bug in systemd soft reboot detection. The code accepts when boot_id == current_marker immediately after reconnection succeeds, but this will cause false positives. If the connection doesn't drop quickly enough after triggering the reboot command, the first successful connection attempt will find the boot ID unchanged (because reboot hasn't happened yet) and incorrectly report success.

The code needs to:

  1. First ensure the connection has dropped (guest became unavailable)
  2. Then wait for reconnection
  3. Only then verify boot ID is unchanged

For example, add a flag to track if connection was lost:

connection_dropped = False
try:
    self.execute(Command('whoami'), silent=True)
    if not connection_dropped and mode == RebootMode.SYSTEMD_SOFT:
        # Still connected, reboot hasn't happened
        raise tmt.utils.wait.WaitingIncompleteError
except tmt.utils.RunError:
    connection_dropped = True
    raise tmt.utils.wait.WaitingIncompleteError

if mode == RebootMode.SYSTEMD_SOFT and connection_dropped:
    # Now check boot ID is same
Suggested change
if mode == RebootMode.SYSTEMD_SOFT:
new_marker = get_boot_id()
# For systemd soft reboot, boot ID should stay the same
if new_marker == current_marker:
self.debug(
"Systemd soft reboot completed successfully (boot ID unchanged)."
)
return
self.debug(
f"Boot ID changed from {current_marker} to {new_marker}, "
"this might indicate a hard reboot occurred instead."
)
return # Still accept it as the guest is back up
if mode == RebootMode.SYSTEMD_SOFT:
new_marker = get_boot_id()
# For systemd soft reboot, boot ID should stay the same
if connection_dropped and new_marker == current_marker:
self.debug(
"Systemd soft reboot completed successfully (boot ID unchanged)."
)
return
if connection_dropped:
self.debug(
f"Boot ID changed from {current_marker} to {new_marker}, "
"this might indicate a hard reboot occurred instead."
)
return # Still accept it as the guest is back up
# If we get here without connection_dropped, the reboot hasn't happened yet
raise tmt.utils.wait.WaitingIncompleteError

Spotted by Graphite Agent

Fix in Graphite


Is this helpful? React 👍 or 👎 to let us know.


except tmt.utils.RunError as error:
self.debug('Failed to connect to the guest.')
raise tmt.utils.wait.WaitingIncompleteError from error

try:
wait.wait(check_boot_time, self._logger)
wait.wait(check_reboot_completion, self._logger)

except tmt.utils.wait.WaitingTimedOutError:
self.debug("Connection to guest failed after reboot.")
Expand Down
15 changes: 12 additions & 3 deletions tmt/steps/scripts/tmt-reboot
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,22 @@ PATH=/sbin:/usr/sbin:$PATH

command=""
timeout=""
systemd_soft_reboot=""

while getopts "c:t:e" flag; do
while getopts "c:t:esh" flag; do
case "${flag}" in
c) command="${OPTARG}";;
t) timeout="${OPTARG}";;
e) logger -s "The '-e' option is noop as of tmt-1.58, see release docs for more information.";;
*) exit 1;;
s) systemd_soft_reboot="true";;
h) echo "Usage: tmt-reboot [-c COMMAND] [-t TIMEOUT] [-s] [-h]"
echo " -c COMMAND Custom reboot command to use"
echo " -t TIMEOUT Timeout in seconds"
echo " -s Use systemd soft-reboot (preserves kernel state)"
echo " -h Show this help message"
exit 0;;
*) echo "Usage: tmt-reboot [-c COMMAND] [-t TIMEOUT] [-s] [-h]"
exit 1;;
esac
done

Expand All @@ -47,4 +56,4 @@ if [[ -f /root/EFI_BOOT_ENTRY.TXT ]]; then
fi
fi

flock "$TMT_TEST_PIDFILE_LOCK" tmt-reboot-core "$command" "$timeout"
flock "$TMT_TEST_PIDFILE_LOCK" tmt-reboot-core "$command" "$timeout" "$systemd_soft_reboot"
8 changes: 7 additions & 1 deletion tmt/steps/scripts/tmt-reboot-core
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ fi

command="$1"
timeout="$2"
systemd_soft_reboot="$3"

# Thanks to being run while holding the pidfile lock, the file exists and should
# no go away. It should be safe to read test PID and reboot-request filepath from
Expand All @@ -19,7 +20,12 @@ read -r test_pid test_reboot_file < "$TMT_TEST_PIDFILE"

mkdir -p "$(dirname "$test_reboot_file")"

printf '{"command": "%s", "timeout": "%s"}' "$command" "$timeout" > "$test_reboot_file"
# Set default command for systemd soft-reboot if systemd_soft_reboot is requested and no custom command
if [[ "$systemd_soft_reboot" == "true" && -z "$command" ]]; then
command="systemctl soft-reboot"
fi

printf '{"command": "%s", "timeout": "%s", "systemd_soft_reboot": "%s"}' "$command" "$timeout" "$systemd_soft_reboot" > "$test_reboot_file"

# Flush buffers to prevent potential data loss
sync
Expand Down
1 change: 1 addition & 0 deletions tmt/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1059,6 +1059,7 @@ def get_output(self) -> Optional[str]:
class CommandOutput:
stdout: Optional[str]
stderr: Optional[str]
returncode: int = 0


class ShellScript:
Expand Down
Loading