Skip to content

Commit

Permalink
feat: add health check for a minimal memory / disk size
Browse files Browse the repository at this point in the history
This PR adds two additional checks which are performed during the boot sequence and in `talosctl health`. They ensure that nodes have enough memory and disk space.

- Boot check will print a warning if memory / disk size is not sufficient.
- Health check will fail if memory / disk size is not sufficient.

Closes #6467

Signed-off-by: Dmitriy Matrenichev <dmitry.matrenichev@siderolabs.com>
  • Loading branch information
DmitriyMV committed Dec 10, 2022
1 parent d04970d commit eb332cf
Show file tree
Hide file tree
Showing 5 changed files with 415 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ func (*Sequencer) Boot(r runtime.Runtime) []runtime.Phase {
).Append(
"saveConfig",
SaveConfig,
).Append(
"memorySizeCheck",
MemorySizeCheck,
).Append(
"diskSizeCheck",
DiskSizeCheck,
).Append(
"env",
SetUserEnvVars,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ import (
cgroupsv2 "github.com/containerd/cgroups/v2"
"github.com/cosi-project/runtime/pkg/resource"
"github.com/cosi-project/runtime/pkg/state"
multierror "github.com/hashicorp/go-multierror"
"github.com/dustin/go-humanize"
"github.com/hashicorp/go-multierror"
"github.com/opencontainers/runtime-spec/specs-go"
pprocfs "github.com/prometheus/procfs"
"github.com/siderolabs/go-blockdevice/blockdevice"
"github.com/siderolabs/go-blockdevice/blockdevice/partition/gpt"
"github.com/siderolabs/go-blockdevice/blockdevice/util"
Expand Down Expand Up @@ -73,6 +75,7 @@ import (
resourcefiles "github.com/siderolabs/talos/pkg/machinery/resources/files"
"github.com/siderolabs/talos/pkg/machinery/resources/k8s"
resourceruntime "github.com/siderolabs/talos/pkg/machinery/resources/runtime"
"github.com/siderolabs/talos/pkg/minimal"
"github.com/siderolabs/talos/pkg/version"
)

Expand Down Expand Up @@ -681,6 +684,82 @@ func ValidateConfig(seq runtime.Sequence, data interface{}) (runtime.TaskExecuti
}, "validateConfig"
}

// MemorySizeCheck represents the MemorySizeCheck task.
//
// The returned task compares the total memory reported by procfs against the
// minimum and recommended values returned by minimal.Memory for the machine
// type found in the config. The outcome is only logged: an undersized node
// produces warnings, not an error. An error is returned only when procfs
// cannot be opened, meminfo cannot be read, or minimal.Memory fails.
//
// The check is skipped when the platform runs in container mode.
func MemorySizeCheck(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) {
	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) error {
		// Write through the logger injected by the sequencer rather than the
		// global "log" package, so the output follows the task's logger.
		if r.State().Platform().Mode() == runtime.ModeContainer {
			logger.Println("skipping memory size check in the container")

			return nil
		}

		pc, err := pprocfs.NewDefaultFS()
		if err != nil {
			return fmt.Errorf("failed to open procfs: %w", err)
		}

		info, err := pc.Meminfo()
		if err != nil {
			return fmt.Errorf("failed to read meminfo: %w", err)
		}

		minimum, recommended, err := minimal.Memory(r.Config().Machine().Type())
		if err != nil {
			return err
		}

		// MemTotal is scaled by KiByte here (presumably procfs reports it in
		// KiB - confirm against the procfs version in use); SafeDeref guards
		// against a nil MemTotal pointer.
		memTotal := pointer.SafeDeref(info.MemTotal) * humanize.KiByte

		switch {
		case memTotal < minimum:
			logger.Println("WARNING: memory size is less than recommended")
			logger.Println("WARNING: Talos may not work properly")
			logger.Println("WARNING: minimum memory size is", minimum/humanize.MiByte, "MiB")
			logger.Println("WARNING: recommended memory size is", recommended/humanize.MiByte, "MiB")
			logger.Println("WARNING: current total memory size is", memTotal/humanize.MiByte, "MiB")
		case memTotal < recommended:
			logger.Println("NOTE: recommended memory size is", recommended/humanize.MiByte, "MiB")
			logger.Println("NOTE: current total memory size is", memTotal/humanize.MiByte, "MiB")
		default:
			logger.Println("memory size is OK")
			logger.Println("memory size is", memTotal/humanize.MiByte, "MiB")
		}

		return nil
	}, "memorySizeCheck"
}

// DiskSizeCheck represents the DiskSizeCheck task.
//
// The returned task compares the size of the disk returned by
// r.State().Machine().Disk() against minimal.DiskSize. The outcome is only
// logged: an undersized disk produces warnings, not an error. An error is
// returned only when the disk state is unavailable or its size cannot be read.
//
// The check is skipped when the platform runs in container mode.
func DiskSizeCheck(runtime.Sequence, any) (runtime.TaskExecutionFunc, string) {
	return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) error {
		// Write through the logger injected by the sequencer rather than the
		// global "log" package, so the output follows the task's logger.
		if r.State().Platform().Mode() == runtime.ModeContainer {
			logger.Println("skipping disk size check in the container")

			return nil
		}

		disk := r.State().Machine().Disk() // get ephemeral disk state
		if disk == nil {
			return fmt.Errorf("failed to get ephemeral disk state")
		}

		diskSize, err := disk.Size()
		if err != nil {
			return fmt.Errorf("failed to get ephemeral disk size: %w", err)
		}

		minimum := minimal.DiskSize()
		if diskSize < minimum {
			logger.Println("WARNING: disk size is less than recommended")
			logger.Println("WARNING: Talos may not work properly")
			logger.Println("WARNING: minimum recommended disk size is", minimum/humanize.MiByte, "MiB")
			logger.Println("WARNING: current total disk size is", diskSize/humanize.MiByte, "MiB")

			return nil
		}

		logger.Println("disk size is OK")
		logger.Println("disk size is", diskSize/humanize.MiByte, "MiB")

		return nil
	}, "diskSizeCheck"
}

// SetUserEnvVars represents the SetUserEnvVars task.
func SetUserEnvVars(seq runtime.Sequence, data interface{}) (runtime.TaskExecutionFunc, string) {
return func(ctx context.Context, logger *log.Logger, r runtime.Runtime) (err error) {
Expand Down
14 changes: 14 additions & 0 deletions pkg/cluster/check/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,20 @@ func DefaultClusterChecks() []ClusterCheck {
}, 5*time.Minute, 5*time.Second)
},

// wait for all nodes to report their memory size
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all nodes memory sizes", func(ctx context.Context) error {
return AllNodesMemorySizes(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// wait for all nodes to report their disk size
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all nodes disk sizes", func(ctx context.Context) error {
return AllNodesDiskSizes(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// wait for kubelet to be healthy on all
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("kubelet to be healthy", func(ctx context.Context) error {
Expand Down
Loading

0 comments on commit eb332cf

Please sign in to comment.