add ALL config

google · Oct 7, 2024 · 67f52a4 · 67f52a4
1 parent c925837
commit 67f52a4
Show file tree

Hide file tree

Showing 9 changed files with 189 additions and 172 deletions.
diff --git a/launcher/container_runner.go b/launcher/container_runner.go
@@ -29,7 +29,6 @@ import (
 	"github.com/google/go-tpm-tools/cel"
 	"github.com/google/go-tpm-tools/client"
 	"github.com/google/go-tpm-tools/launcher/agent"
-	"github.com/google/go-tpm-tools/launcher/internal/healthmonitoring/nodeproblemdetector"
 	"github.com/google/go-tpm-tools/launcher/internal/signaturediscovery"
 	"github.com/google/go-tpm-tools/launcher/launcherfile"
 	"github.com/google/go-tpm-tools/launcher/registryauth"
@@ -521,16 +520,6 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
 	go teeServer.Serve()
 	defer teeServer.Shutdown(ctx)
 
-	// start node-problem-detector.service to collect memory related metrics.
-	if r.launchSpec.MemoryMonitoringEnabled {
-		r.logger.Println("MemoryMonitoring is enabled by the VM operator")
-		if err := nodeproblemdetector.StartService(r.logger); err != nil {
-			return err
-		}
-	} else {
-		r.logger.Println("MemoryMonitoring is disabled by the VM operator")
-	}
-
 	var streamOpt cio.Opt
 	switch r.launchSpec.LogRedirect {
 	case spec.Nowhere:

diff --git a/launcher/internal/healthmonitoring/config.go b/launcher/internal/healthmonitoring/config.go
@@ -1,4 +1,4 @@
-package monitoring
+package healthmonitoring
 
 import (
 	"fmt"

diff --git a/launcher/internal/healthmonitoring/nodeproblemdetector/systemstats_config.go b/launcher/internal/healthmonitoring/nodeproblemdetector/systemstats_config.go
@@ -23,13 +23,20 @@ type statsConfig struct {
 	MetricsConfigs map[string]metricConfig `json:"metricsConfigs"`
 }
 
+// diskConfig is the "disk" section of the system stats config: which block
+// devices to include, the lsblk timeout, and the disk metrics to collect.
+// Every struct tag must use the `json:"name"` form; an unquoted tag value is
+// ignored by encoding/json and the Go field name is emitted instead.
+// NOTE(review): node-problem-detector's disk section appears to expect
+// "metricsConfigs" to map metric names directly to their configs; wrapping a
+// *statsConfig here nests a second "metricsConfigs" level — confirm.
+type diskConfig struct {
+	IncludeAllAttachedBlk bool         `json:"includeAllAttachedBlk"`
+	IncludeRootBlk        bool         `json:"includeRootBlk"`
+	LsblkTimeout          string       `json:"lsblkTimeout"`
+	MetricsConfigs        *statsConfig `json:"metricsConfigs"`
+}
+
 // SystemStatsConfig contains configurations for `System Stats Monitor`,
 // a problem daemon in node-problem-detector that collects pre-defined health-related metrics from different system components.
 // For now we only consider collecting memory related metrics.
 // View the comprehensive configuration details on https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor#detailed-configuration-options
 type SystemStatsConfig struct {
 	CPU            *statsConfig `json:"cpu,omitempty"`
-	Disk           *statsConfig `json:"disk,omitempty"`
+	Disk           *diskConfig  `json:"disk,omitempty"`
 	Host           *statsConfig `json:"host,omitempty"`
 	Memory         *statsConfig `json:"memory,omitempty"`
 	InvokeInterval string       `json:"invokeInterval,omitempty"`
@@ -43,25 +50,50 @@ func NewSystemStatsConfig() SystemStatsConfig {
 	}
 }
 
-var healthConfig = &SystemStatsConfig{
+// allConfig is the system stats config that enables the CPU, disk, host, and
+// memory metrics listed below, with InvokeInterval set to
+// defaultInvokeIntervalString. EnableAllConfig writes it to disk.
+var allConfig = &SystemStatsConfig{
 	CPU: &statsConfig{map[string]metricConfig{
-		"cpu/load_5m": {"cpu/load_5m"},
-	}},
-	Disk: &statsConfig{map[string]metricConfig{
-		"disk/percent_used": {"disk/percent_used"},
+		"cpu/runnable_task_count": {"cpu/runnable_task_count"},
+		"cpu/usage_time":          {"cpu/usage_time"},
+		"cpu/load_1m":             {"cpu/load_1m"},
+		"cpu/load_5m":             {"cpu/load_5m"},
+		"cpu/load_15m":            {"cpu/load_15m"},
+		"system/cpu_stat":         {"system/cpu_stat"},
+		"system/interrupts_total": {"system/interrupts_total"},
+		"system/processes_total":  {"system/processes_total"},
+		"system/procs_blocked":    {"system/procs_blocked"},
+		"system/procs_running":    {"system/procs_running"},
 	}},
+	// Keyed fields keep the two adjacent bools unambiguous if diskConfig's
+	// field order ever changes.
+	Disk: &diskConfig{
+		IncludeAllAttachedBlk: true,
+		IncludeRootBlk:        true,
+		LsblkTimeout:          "5s",
+		MetricsConfigs: &statsConfig{map[string]metricConfig{
+			"disk/avg_queue_len":          {"disk/avg_queue_len"},
+			"disk/bytes_used":             {"disk/bytes_used"},
+			"disk/percent_used":           {"disk/percent_used"},
+			"disk/io_time":                {"disk/io_time"},
+			"disk/merged_operation_count": {"disk/merged_operation_count"},
+			"disk/operation_bytes_count":  {"disk/operation_bytes_count"},
+			"disk/operation_count":        {"disk/operation_count"},
+			"disk/operation_time":         {"disk/operation_time"},
+			"disk/weighted_io":            {"disk/weighted_io"},
+		}},
+	},
 	Host: &statsConfig{map[string]metricConfig{
 		"host/uptime": {"host/uptime"},
 	}},
 	Memory: &statsConfig{map[string]metricConfig{
-		"memory/bytes_used": {"memory/bytes_used"},
+		"memory/anonymous_used":   {"memory/anonymous_used"},
+		"memory/bytes_used":       {"memory/bytes_used"},
+		"memory/dirty_used":       {"memory/dirty_used"},
+		"memory/page_cache_used":  {"memory/page_cache_used"},
+		"memory/unevictable_used": {"memory/unevictable_used"},
+		"memory/percent_used":     {"memory/percent_used"},
 	}},
 	InvokeInterval: defaultInvokeIntervalString,
 }
 
-// EnableHealthMonitoringConfig overwrites system stats config with health monitoring config.
-func EnableHealthMonitoringConfig() error {
-	return healthConfig.WriteFile(systemStatsFilePath)
+// EnableAllConfig writes allConfig to systemStatsFilePath, replacing the system
+// stats config with the one that enables every metric listed in allConfig.
+// It returns the error from WriteFile, if any.
+func EnableAllConfig() error {
+	return allConfig.WriteFile(systemStatsFilePath)
 }
 
 // EnableMemoryBytesUsed enables "memory/bytes_used" for memory monitoring.

diff --git a/launcher/internal/healthmonitoring/nodeproblemdetector/systemstats_config_test.go b/launcher/internal/healthmonitoring/nodeproblemdetector/systemstats_config_test.go
@@ -16,12 +16,12 @@ func TestEnableHealthMonitoringConfig(t *testing.T) {
 	tmpDir := t.TempDir()
 	systemStatsFilePath = path.Join(tmpDir, "system-stats-monitor.json")
 
-	wantBytes, err := json.Marshal(healthConfig)
+	wantBytes, err := json.Marshal(allConfig)
 	if err != nil {
 		t.Fatalf("Error marshaling expected config: %v", err)
 	}
 
-	EnableHealthMonitoringConfig()
+	EnableAllConfig()
 
 	file, err := os.OpenFile(systemStatsFilePath, os.O_RDONLY, 0)
 	if err != nil {

diff --git a/launcher/launcher/main.go b/launcher/launcher/main.go
@@ -96,12 +96,14 @@ func main() {
 		return
 	}
 
-	if launchSpec.HealthMonitoringEnabled {
-		logger.Printf("Health Monitoring is enabled by the VM operator")
+	if launchSpec.MonitoringEnabled != spec.None {
+		logger.Printf("Monitoring is enabled by the VM operator")
 
-		if err := nodeproblemdetector.EnableHealthMonitoringConfig(); err != nil {
-			logger.Printf("failed to enable Health Monitoring config: %v", err)
-			return
+		if launchSpec.MonitoringEnabled == spec.All {
+			if err := nodeproblemdetector.EnableAllConfig(); err != nil {
+				logger.Printf("failed to enable Health Monitoring config: %v", err)
+				return
+			}
 		}
 
 		if err := nodeproblemdetector.StartService(logger); err != nil {

diff --git a/launcher/spec/launch_policy.go b/launcher/spec/launch_policy.go
@@ -7,8 +7,6 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
-
-	"github.com/google/go-tpm-tools/launcher/internal/healthmonitoring/monitoring"
 )
 
 // LaunchPolicy contains policies on starting the container.
@@ -18,8 +16,8 @@ type LaunchPolicy struct {
 	AllowedCmdOverride       bool
 	AllowedLogRedirect       policy
 	AllowedMountDestinations []string
-	HardenedImageMonitoring  monitoring.Config
-	DebugImageMonitoring     monitoring.Config
+	HardenedImageMonitoring  MonitoringType
+	DebugImageMonitoring     MonitoringType
 }
 
 type policy int
@@ -30,14 +28,27 @@ const (
 	never
 )
 
-type monitoringType int
+// MonitoringType identifies a level of monitoring. It is used both for what a
+// launch policy allows and for what a launch spec requests.
+type MonitoringType int
 
 const (
-	none monitoringType = iota
-	memoryOnly
-	health
+	// None means no monitoring. It is the zero value.
+	None MonitoringType = iota
+	// MemoryOnly means memory monitoring only.
+	MemoryOnly
+	// All means all monitoring.
+	All
 )
 
+// toMonitoringType converts the string s to its MonitoringType.
+// Matching is case-insensitive and accepts "none", "memoryonly", and "all".
+// Any other value yields None together with an error naming the input.
+func toMonitoringType(s string) (MonitoringType, error) {
+	switch lowered := strings.ToLower(s); lowered {
+	case "all":
+		return All, nil
+	case "memoryonly":
+		return MemoryOnly, nil
+	case "none":
+		return None, nil
+	default:
+		return None, fmt.Errorf("invalid monitoring type %v", s)
+	}
+}
+
 // String returns LaunchPolicy details.
 func (p policy) String() string {
 	switch p {
@@ -102,40 +113,39 @@ func configureMonitoringPolicy(imageLabels map[string]string, launchPolicy *Laun
 
 		logger.Printf("%s will be deprecated, use %s and %s instead", memoryMonitoring, hardenedMonitoring, debugMonitoring)
 
-		memoryOnlyConfig := monitoring.Config{CPU: false, Disk: false, Host: false, Memory: false}
 		switch policy {
 		case always:
-			logger.Printf("%s=always, will be treated as %s=memory and %s=memory", memoryMonitoring, hardenedMonitoring, debugMonitoring)
-			launchPolicy.HardenedImageMonitoring = memoryOnlyConfig
-			launchPolicy.DebugImageMonitoring = memoryOnlyConfig
+			logger.Printf("%s=always, will be treated as %s=memory_only and %s=memory_only", memoryMonitoring, hardenedMonitoring, debugMonitoring)
+			launchPolicy.HardenedImageMonitoring = MemoryOnly
+			launchPolicy.DebugImageMonitoring = MemoryOnly
 		case never:
 			logger.Printf("%s=never, will be treated as %s=none and %s=none", memoryMonitoring, hardenedMonitoring, debugMonitoring)
-			launchPolicy.HardenedImageMonitoring = memoryOnlyConfig
-			launchPolicy.DebugImageMonitoring = monitoring.NoneConfig()
+			launchPolicy.HardenedImageMonitoring = None
+			launchPolicy.DebugImageMonitoring = None
 		case debugOnly:
 			logger.Printf("%s=debug_only, will be treated as %s=none and %s=memory", memoryMonitoring, hardenedMonitoring, debugMonitoring)
-			launchPolicy.HardenedImageMonitoring = monitoring.NoneConfig()
-			launchPolicy.DebugImageMonitoring = memoryOnlyConfig
+			launchPolicy.HardenedImageMonitoring = None
+			launchPolicy.DebugImageMonitoring = MemoryOnly
 		}
 		return nil
 	}
 
 	if hardenedOk {
-		launchPolicy.HardenedImageMonitoring, err = monitoring.ToConfig(hardenedVal)
+		launchPolicy.HardenedImageMonitoring, err = toMonitoringType(hardenedVal)
 		if err != nil {
 			return fmt.Errorf("invalid monitoring type for hardened image: %v", err)
 		}
 	} else {
-		launchPolicy.HardenedImageMonitoring = monitoring.NoneConfig()
+		launchPolicy.HardenedImageMonitoring = None
 	}
 
 	if debugOk {
-		launchPolicy.DebugImageMonitoring, err = monitoring.ToConfig(debugVal)
+		launchPolicy.DebugImageMonitoring, err = toMonitoringType(debugVal)
 		if err != nil {
 			return fmt.Errorf("invalid monitoring type for debug image: %v", err)
 		}
 	} else {
-		launchPolicy.DebugImageMonitoring = monitoring.AllConfig()
+		launchPolicy.DebugImageMonitoring = MemoryOnly
 	}
 
 	return nil
@@ -189,6 +199,22 @@ func GetLaunchPolicy(imageLabels map[string]string, logger *log.Logger) (LaunchP
 	return launchPolicy, nil
 }
 
+// verifyMonitoringConfig checks that the monitoring level requested by the
+// launch spec does not exceed what the launch policy allows.
+// A None policy accepts only a None spec, a MemoryOnly policy rejects an All
+// spec, and every other combination is accepted. It returns nil on success.
+func verifyMonitoringConfig(policy MonitoringType, spec MonitoringType) error {
+	switch {
+	case policy == None && spec != None:
+		return fmt.Errorf("spec configured for %v but policy is none", spec)
+	case policy == MemoryOnly && spec == All:
+		return fmt.Errorf("spec configured all monitoring, policy only allows memory")
+	}
+
+	return nil
+}
+
 // Verify will use the LaunchPolicy to verify the given LaunchSpec. If the verification passed, will return nil.
 // If there are multiple violations, the function will return the first error.
 func (p LaunchPolicy) Verify(ls LaunchSpec) error {
@@ -214,8 +240,8 @@ func (p LaunchPolicy) Verify(ls LaunchSpec) error {
 		monitoringPolicy = p.HardenedImageMonitoring
 	}
 
-	if err := monitoring.CheckCompliance(monitoringPolicy, ls.MonitoringEnabled); err != nil {
-		return fmt.Errorf("error verifying monitoring configs: %v", err)
+	if err := verifyMonitoringConfig(monitoringPolicy, ls.MonitoringEnabled); err != nil {
+		return fmt.Errorf("error verifying monitoring config: %v", err)
 	}
 
 	var err error