Skip to content

Commit

Permalink
feat: introduce new flag in reset API that makes Talos reset user disks
Browse files Browse the repository at this point in the history
Fixes: #6815

Additionally, make it possible to run reset in maintenance mode, which
provides a way to reset the system disk and remove all traces of Talos
from it.

The new reset flow runs in a separate sequence, and the disk probe
lookup now checks the boot partition instead of the ephemeral one.

Signed-off-by: Artem Chernyshev <artem.chernyshev@talos-systems.com>
  • Loading branch information
Unix4ever committed Feb 28, 2023
1 parent f55f5df commit b520710
Show file tree
Hide file tree
Showing 18 changed files with 2,144 additions and 1,620 deletions.
9 changes: 9 additions & 0 deletions api/machine/machine.proto
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,11 @@ message ResetPartitionSpec {
}

message ResetRequest {
// WipeMode selects which class of disks the reset applies to.
enum WipeMode {
// ALL enables wiping of both the system disk and the requested user disks.
ALL = 0;
// SYSTEM_DISK enables wiping of the system disk only.
SYSTEM_DISK = 1;
// USER_DISKS enables wiping of the requested user disks only.
USER_DISKS = 2;
}
// Graceful indicates whether node should leave etcd before the reset; it also
// enforces etcd checks before leaving.
bool graceful = 1;
Expand All @@ -294,6 +299,10 @@ message ResetRequest {
// System_partitions_to_wipe lists specific system disk partitions to be reset (wiped).
// If system_partitions_to_wipe is empty, all the partitions are erased.
repeated ResetPartitionSpec system_partitions_to_wipe = 3;
// UserDisksToWipe lists specific connected block devices to be reset (wiped).
repeated string user_disks_to_wipe = 4;
// WipeMode defines which devices should be wiped.
WipeMode mode = 5;
}

// The reset message containing the restart status.
Expand Down
2 changes: 2 additions & 0 deletions api/storage/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ message Disk {
DiskType type = 9;
// BusPath is the bus path of the disk.
string bus_path = 10;
// SystemDisk indicates that the disk is used as Talos system disk.
bool system_disk = 11;
}

// DisksResponse represents the response of the `Disks` RPC.
Expand Down
8 changes: 8 additions & 0 deletions cmd/talosctl/cmd/talos/disks.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ func printDisks(ctx context.Context, c *client.Client) error {
"NAME",
"SIZE",
"BUS_PATH",
"SYSTEM_DISK",
}, "\t")

getWithPlaceholder := func(in string) string {
Expand Down Expand Up @@ -95,6 +96,12 @@ func printDisks(ctx context.Context, c *client.Client) error {
args = append(args, node)
}

isSystemDisk := ""

if disk.SystemDisk {
isSystemDisk = "*"
}

args = append(args, []interface{}{
getWithPlaceholder(disk.DeviceName),
getWithPlaceholder(disk.Model),
Expand All @@ -106,6 +113,7 @@ func printDisks(ctx context.Context, c *client.Client) error {
getWithPlaceholder(disk.Name),
humanize.Bytes(disk.Size),
getWithPlaceholder(disk.BusPath),
isSystemDisk,
}...)

pattern := strings.Repeat("%s\t", len(args))
Expand Down
73 changes: 71 additions & 2 deletions cmd/talosctl/cmd/talos/reset.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ package talos
import (
"context"
"fmt"
"sort"
"strings"

"github.com/siderolabs/gen/maps"
"github.com/spf13/cobra"

"github.com/siderolabs/talos/cmd/talosctl/pkg/talos/action"
Expand All @@ -16,10 +19,61 @@ import (
"github.com/siderolabs/talos/pkg/machinery/client"
)

// wipeOptions maps the string values accepted by the --wipe-mode flag to the
// corresponding Reset API wipe modes.
var wipeOptions = map[string]machineapi.ResetRequest_WipeMode{
wipeModeAll: machineapi.ResetRequest_ALL,
wipeModeSystemDisk: machineapi.ResetRequest_SYSTEM_DISK,
wipeModeUserDisks: machineapi.ResetRequest_USER_DISKS,
}

// WipeMode is the command-line flag value for the reset wipe mode.
//
// It wraps machineapi.ResetRequest_WipeMode and provides the String, Set and
// Type methods so it can be registered as the --wipe-mode flag.
type WipeMode machineapi.ResetRequest_WipeMode

// String forms of the wipe modes, as accepted by the --wipe-mode flag and
// returned by WipeMode.String.
const (
wipeModeAll = "all"
wipeModeSystemDisk = "system-disk"
wipeModeUserDisks = "user-disks"
)

// String implements Flag interface.
//
// It returns the command-line spelling of the wipe mode.
func (m WipeMode) String() string {
	mode := machineapi.ResetRequest_WipeMode(m)

	if mode == machineapi.ResetRequest_SYSTEM_DISK {
		return wipeModeSystemDisk
	}

	if mode == machineapi.ResetRequest_USER_DISKS {
		return wipeModeUserDisks
	}

	// ResetRequest_ALL and any unrecognized value are both rendered as "all".
	return wipeModeAll
}

// Set implements Flag interface.
//
// It stores the wipe mode named by value, or returns an error listing the
// accepted options when value is not one of them.
func (m *WipeMode) Set(value string) error {
	if mode, known := wipeOptions[value]; known {
		*m = WipeMode(mode)

		return nil
	}

	return fmt.Errorf("possible options are: %s", m.Type())
}

// Type implements Flag interface.
//
// It returns the accepted option names, sorted and comma-separated, rather
// than a type name.
func (m *WipeMode) Type() string {
	names := sort.StringSlice(maps.Keys(wipeOptions))
	names.Sort()

	return strings.Join(names, ", ")
}

// resetCmdFlags holds the values of the command-line flags of the reset command.
var resetCmdFlags struct {
// trackableActionCmdFlags is embedded; its flags are registered via addTrackActionFlags.
trackableActionCmdFlags
// graceful is bound to --graceful: attempt to cordon/drain node and leave etcd (if applicable).
graceful bool
// reboot is bound to --reboot: reboot the node after resetting instead of shutting down.
reboot bool
// insecure is bound to --insecure: reset using the maintenance service.
insecure bool
// wipeMode is bound to --wipe-mode and is sent as the Mode of the reset request.
wipeMode WipeMode
// userDisksToWipe is bound to --user-disks-to-wipe: devices to wipe.
userDisksToWipe []string
// systemLabelsToWipe is bound to --system-labels-to-wipe: system disk partitions (by label) to wipe.
systemLabelsToWipe []string
}

Expand All @@ -36,8 +90,12 @@ var resetCmd = &cobra.Command{

resetRequest := buildResetRequest()

if resetCmdFlags.wait && resetCmdFlags.insecure {
return fmt.Errorf("cannot use --wait and --insecure together")
}

if !resetCmdFlags.wait {
return WithClient(func(ctx context.Context, c *client.Client) error {
resetNoWait := func(ctx context.Context, c *client.Client) error {
if err := helpers.ClientVersionCheck(ctx, c); err != nil {
return err
}
Expand All @@ -47,7 +105,13 @@ var resetCmd = &cobra.Command{
}

return nil
})
}

if resetCmdFlags.insecure {
return WithClientMaintenance(nil, resetNoWait)
}

return WithClient(resetNoWait)
}

actionFn := func(ctx context.Context, c *client.Client) (string, error) {
Expand Down Expand Up @@ -93,6 +157,8 @@ func buildResetRequest() *machineapi.ResetRequest {
return &machineapi.ResetRequest{
Graceful: resetCmdFlags.graceful,
Reboot: resetCmdFlags.reboot,
UserDisksToWipe: resetCmdFlags.userDisksToWipe,
Mode: machineapi.ResetRequest_WipeMode(resetCmdFlags.wipeMode),
SystemPartitionsToWipe: systemPartitionsToWipe,
}
}
Expand All @@ -113,6 +179,9 @@ func resetGetActorID(ctx context.Context, c *client.Client, req *machineapi.Rese
func init() {
resetCmd.Flags().BoolVar(&resetCmdFlags.graceful, "graceful", true, "if true, attempt to cordon/drain node and leave etcd (if applicable)")
resetCmd.Flags().BoolVar(&resetCmdFlags.reboot, "reboot", false, "if true, reboot the node after resetting instead of shutting down")
resetCmd.Flags().BoolVar(&resetCmdFlags.insecure, "insecure", false, "reset using the insecure (encrypted with no auth) maintenance service")
resetCmd.Flags().Var(&resetCmdFlags.wipeMode, "wipe-mode", "disk reset mode")
resetCmd.Flags().StringSliceVar(&resetCmdFlags.userDisksToWipe, "user-disks-to-wipe", nil, "if set, wipes defined devices in the list")
resetCmd.Flags().StringSliceVar(&resetCmdFlags.systemLabelsToWipe, "system-labels-to-wipe", nil, "if set, just wipe selected system disk partitions by label but keep other partitions intact")
resetCmdFlags.addTrackActionFlags(resetCmd)
addCommand(resetCmd)
Expand Down
9 changes: 9 additions & 0 deletions hack/release.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,15 @@ To switch TTYs, use the `Alt+F1` through `Alt+F2` keys.
You can disable this behavior by setting the kernel parameter `talos.dashboard.disabled=1`.
This behavior is disabled by default on SBCs.
"""

[notes.reset]
title = "Reset API Enhancements"
description="""\
Talos now supports resetting user disks through the Reset API.
The list of disks to wipe is set using the `--user-disks-to-wipe` flag in `talosctl`.
Additionally, the Reset API can now function in maintenance mode
and has the capability to wipe the node's system disk (partial wipe is not supported).
"""

[make_deps]
Expand Down
44 changes: 42 additions & 2 deletions internal/app/machined/internal/server/v1alpha1/v1alpha1_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"github.com/rs/xid"
"github.com/siderolabs/gen/slices"
"github.com/siderolabs/go-blockdevice/blockdevice/partition/gpt"
bddisk "github.com/siderolabs/go-blockdevice/blockdevice/util/disk"
"github.com/siderolabs/go-kmsg"
"github.com/siderolabs/go-pointer"
"go.etcd.io/etcd/api/v3/etcdserverpb"
Expand Down Expand Up @@ -150,7 +151,7 @@ func (s *Server) Register(obj *grpc.Server) {
resource.RegisterResourceServiceServer(obj, &resources.Server{Resources: resourceState}) //nolint:staticcheck
cosiv1alpha1.RegisterStateServer(obj, server.NewState(resourceState))
inspect.RegisterInspectServiceServer(obj, &InspectServer{server: s})
storage.RegisterStorageServiceServer(obj, &storaged.Server{})
storage.RegisterStorageServiceServer(obj, &storaged.Server{Controller: s.Controller})
timeapi.RegisterTimeServiceServer(obj, &TimeServer{ConfigProvider: s.Controller.Runtime()})
}

Expand Down Expand Up @@ -600,7 +601,7 @@ func (opt *ResetOptions) GetSystemDiskTargets() []runtime.PartitionTarget {

// Reset resets the node.
//
//nolint:gocyclo
//nolint:gocyclo,cyclop
func (s *Server) Reset(ctx context.Context, in *machine.ResetRequest) (reply *machine.ResetResponse, err error) {
actorID := uuid.New().String()

Expand All @@ -610,7 +611,46 @@ func (s *Server) Reset(ctx context.Context, in *machine.ResetRequest) (reply *ma
ResetRequest: in,
}

if len(in.GetUserDisksToWipe()) > 0 {
if in.Mode == machine.ResetRequest_SYSTEM_DISK {
return nil, fmt.Errorf("reset failed: invalid input, wipe mode SYSTEM_DISK doesn't support UserDisksToWipe parameter")
}

var diskList []*bddisk.Disk

diskList, err = bddisk.List()
if err != nil {
return nil, err
}

disks := slices.ToMap(diskList, func(disk *bddisk.Disk) (string, *bddisk.Disk) {
return disk.DeviceName, disk
})

systemDisk := s.Controller.Runtime().State().Machine().Disk()

// validate input
for _, deviceName := range in.GetUserDisksToWipe() {
disk, ok := disks[deviceName]
if !ok {
return nil, fmt.Errorf("reset user disk failed: device %s wasn't found", deviceName)
}

if disk.ReadOnly {
return nil, fmt.Errorf("reset user disk failed: device %s is readonly", deviceName)
}

if systemDisk != nil && deviceName == systemDisk.Device().Name() {
return nil, fmt.Errorf("reset user disk failed: device %s is the system disk", deviceName)
}
}
}

if len(in.GetSystemPartitionsToWipe()) > 0 {
if in.Mode == machine.ResetRequest_USER_DISKS {
return nil, fmt.Errorf("reset failed: invalid input, wipe mode USER_DISKS doesn't support SystemPartitionsToWipe parameter")
}

bd := s.Controller.Runtime().State().Machine().Disk().BlockDevice

var pt *gpt.GPT
Expand Down
4 changes: 4 additions & 0 deletions internal/app/machined/pkg/runtime/sequencer.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ func ParseSequence(s string) (seq Sequence, err error) {
seq = SequenceUpgrade
case stageUpgrade:
seq = SequenceStageUpgrade
case maintenanceUpgrade:
seq = SequenceMaintenanceUpgrade
case reset:
seq = SequenceReset
case reboot:
Expand All @@ -126,6 +128,8 @@ func ParseSequence(s string) (seq Sequence, err error) {
type ResetOptions interface {
GetGraceful() bool
GetReboot() bool
GetMode() machine.ResetRequest_WipeMode
GetUserDisksToWipe() []string
GetSystemDiskTargets() []PartitionTarget
}

Expand Down
25 changes: 23 additions & 2 deletions internal/app/machined/pkg/runtime/v1alpha1/v1alpha1_sequencer.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ func (*Sequencer) Reboot(r runtime.Runtime) []runtime.Phase {
}

// Reset is the reset sequence.
//
//nolint:gocyclo,cyclop
func (*Sequencer) Reset(r runtime.Runtime, in runtime.ResetOptions) []runtime.Phase {
phases := PhaseList{}

Expand All @@ -312,6 +314,21 @@ func (*Sequencer) Reset(r runtime.Runtime, in runtime.ResetOptions) []runtime.Ph
withKexec = !bootPartitionInTargets(in.GetSystemDiskTargets())
}

var (
resetUserDisks bool
resetSystemDisk bool
)

switch in.GetMode() {
case machineapi.ResetRequest_ALL:
resetUserDisks = true
resetSystemDisk = true
case machineapi.ResetRequest_USER_DISKS:
resetUserDisks = true
case machineapi.ResetRequest_SYSTEM_DISK:
resetSystemDisk = true
}

switch r.State().Platform().Mode() { //nolint:exhaustive
case runtime.ModeContainer:
phases = phases.AppendList(stopAllPhaselist(r, false)).
Expand Down Expand Up @@ -345,13 +362,17 @@ func (*Sequencer) Reset(r runtime.Runtime, in runtime.ResetOptions) []runtime.Ph
"forceCleanup",
ForceCleanup,
).AppendWhen(
len(in.GetSystemDiskTargets()) == 0,
len(in.GetSystemDiskTargets()) == 0 && resetSystemDisk,
"reset",
ResetSystemDisk,
).AppendWhen(
len(in.GetSystemDiskTargets()) > 0,
len(in.GetSystemDiskTargets()) > 0 && resetSystemDisk,
"resetSpec",
ResetSystemDiskSpec,
).AppendWhen(
len(in.GetUserDisksToWipe()) > 0 && resetUserDisks,
"resetUserDisks",
ResetUserDisks,
).AppendWhen(
in.GetReboot(),
"reboot",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1616,7 +1616,13 @@ func ResetSystemDisk(seq runtime.Sequence, data interface{}) (runtime.TaskExecut
return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) {
var dev *blockdevice.BlockDevice

dev, err = blockdevice.Open(r.State().Machine().Disk().Device().Name())
disk := r.State().Machine().Disk()

if disk == nil {
return nil
}

dev, err = blockdevice.Open(disk.Device().Name())
if err != nil {
return err
}
Expand All @@ -1627,6 +1633,36 @@ func ResetSystemDisk(seq runtime.Sequence, data interface{}) (runtime.TaskExecut
}, "resetSystemDisk"
}

// ResetUserDisks represents the task to reset the user disks.
//
// The returned task expects data to implement runtime.ResetOptions and wipes
// every device returned by GetUserDisksToWipe, in order, stopping at the first
// failure.
func ResetUserDisks(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) {
	// wipeDevice handles a single disk in its own function scope so that the
	// deferred Close runs as soon as that disk is done, instead of keeping
	// every device open until the whole task returns.
	wipeDevice := func(logger *log.Logger, deviceName string) error {
		dev, err := blockdevice.Open(deviceName)
		if err != nil {
			return fmt.Errorf("error opening user disk %s: %w", deviceName, err)
		}

		defer dev.Close() //nolint:errcheck

		logger.Printf("wiping user disk %s", deviceName)

		if err = dev.FastWipe(); err != nil {
			return fmt.Errorf("error wiping user disk %s: %w", deviceName, err)
		}

		return nil
	}

	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) error {
		in, ok := data.(runtime.ResetOptions)
		if !ok {
			return fmt.Errorf("unexpected runtime data")
		}

		for _, deviceName := range in.GetUserDisksToWipe() {
			if err := wipeDevice(logger, deviceName); err != nil {
				return err
			}
		}

		return nil
	}, "resetUserDisks"
}

// ResetSystemDiskSpec represents the task to reset the system disk by spec.
func ResetSystemDiskSpec(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) {
return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) {
Expand Down
Loading

0 comments on commit b520710

Please sign in to comment.