Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support health monitoring mode for NPD #479

Merged
merged 27 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
311c127
enable health monitoring mode for NPD
jessieqliu Aug 23, 2024
00066e0
fix bugs in launchspec
jessieqliu Aug 23, 2024
f3ff5e0
fix test failures
jessieqliu Aug 23, 2024
adac472
fix lint errors
jessieqliu Aug 23, 2024
f4e2c45
more lint fixes
jessieqliu Aug 23, 2024
949dbd1
add policy and spec tests
jessieqliu Aug 26, 2024
a6551e0
fix lint error
jessieqliu Aug 26, 2024
7359627
Update cloudbuild test for memory monitoring
jessieqliu Aug 27, 2024
217a710
update policy for test workloads
jessieqliu Aug 27, 2024
cbd2e20
add backwards compatibility for memory_monitoring_allow policy:
jessieqliu Aug 29, 2024
53973a4
only one field can be specified in spec
jessieqliu Sep 3, 2024
bc35595
fix lint errors
jessieqliu Sep 5, 2024
11da498
commit launchpolicy changes
jessieqliu Sep 5, 2024
537c87e
use configureMonitoringPolicy to process image labels
jessieqliu Sep 9, 2024
57a4434
use original error messages for consistency/testing
jessieqliu Sep 9, 2024
c925837
implement now policy/spec format
jessieqliu Sep 24, 2024
67f52a4
add ALL config
jessieqliu Oct 7, 2024
c7c0906
fix spec tests
jessieqliu Oct 7, 2024
1eb3fc1
fix container runner
jessieqliu Oct 7, 2024
5c22516
fix lint issues
jessieqliu Oct 7, 2024
fdfe013
fix lint issues
jessieqliu Oct 7, 2024
a485857
fix logging for backcompat w tests
jessieqliu Oct 8, 2024
f1fbd72
adjust logs for test backcompat
jessieqliu Oct 9, 2024
3714ca0
adjust mem-monitoring test logs
jessieqliu Oct 10, 2024
e5a8398
fix launchpolicy tests
jessieqliu Oct 10, 2024
176775e
address comments
jessieqliu Oct 14, 2024
e0f1601
capitalize logging string
jessieqliu Oct 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 5 additions & 19 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ import (
"github.com/google/go-tpm-tools/client"
"github.com/google/go-tpm-tools/launcher/agent"
"github.com/google/go-tpm-tools/launcher/internal/signaturediscovery"
"github.com/google/go-tpm-tools/launcher/internal/systemctl"
"github.com/google/go-tpm-tools/launcher/launcherfile"
"github.com/google/go-tpm-tools/launcher/registryauth"
"github.com/google/go-tpm-tools/launcher/spec"
Expand Down Expand Up @@ -115,7 +114,7 @@ func NewRunner(ctx context.Context, cdClient *containerd.Client, token oauth2.To
}

logger.Printf("Image Labels : %v\n", imageConfig.Labels)
launchPolicy, err := spec.GetLaunchPolicy(imageConfig.Labels)
launchPolicy, err := spec.GetLaunchPolicy(imageConfig.Labels, logger)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -341,7 +340,7 @@ func (r *ContainerRunner) measureContainerClaims(ctx context.Context) error {
// eventlog in the AttestationAgent.
func (r *ContainerRunner) measureMemoryMonitor() error {
var enabled uint8
if r.launchSpec.MemoryMonitoringEnabled {
if r.launchSpec.MonitoringEnabled == spec.MemoryOnly {
enabled = 1
}
if err := r.attestAgent.MeasureEvent(cel.CosTlv{EventType: cel.MemoryMonitorType, EventContent: []byte{enabled}}); err != nil {
Expand Down Expand Up @@ -521,22 +520,9 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
go teeServer.Serve()
defer teeServer.Shutdown(ctx)

// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
r.logger.Println("MemoryMonitoring is enabled by the VM operator")
s, err := systemctl.New()
if err != nil {
return fmt.Errorf("failed to create systemctl client: %v", err)
}
defer s.Close()

r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
if err := s.Start("node-problem-detector.service"); err != nil {
return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
}
r.logger.Println("node-problem-detector.service successfully started.")
} else {
r.logger.Println("MemoryMonitoring is disabled by the VM operator")
// Avoids breaking existing memory monitoring tests that depend on this log.
if r.launchSpec.MonitoringEnabled == spec.None {
r.logger.Printf("MemoryMonitoring is disabled by the VM operator")
}

var streamOpt cio.Opt
Expand Down
2 changes: 1 addition & 1 deletion launcher/image/test/test_memory_monitoring.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ steps:
id: CheckMemoryMonitoringEnabled
entrypoint: 'bash'
# Search a regex pattern that ensures memory monitoring is enabled and measured into COS event logs.
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'Successfully measured memory monitoring event.*node-problem-detector.service successfully started']
args: ['scripts/test_memory_monitoring.sh', '${_VM_NAME_PREFIX}-enable-${BUILD_ID}', '${_ZONE}', 'node-problem-detector.service successfully started.*Successfully measured memory monitoring event']
waitFor: ['CreateVMMemoryMemonitorEnabled']
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUpVMMemoryMonitorEnabled
Expand Down
3 changes: 2 additions & 1 deletion launcher/image/testworkloads/memorymonitoring/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ COPY main /

ENV env_bar="val_bar"

LABEL "tee.launch_policy.monitoring_memory_allow"="always"
LABEL "tee.launch_policy.hardened_monitoring"="MEMORY_ONLY"
LABEL "tee.launch_policy.debug_monitoring"="NONE"

ENTRYPOINT ["/main"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ COPY main /

ENV env_bar="val_bar"

LABEL "tee.launch_policy.monitoring_memory_allow"="debugonly"
LABEL "tee.launch_policy.hardened_monitoring"="NONE"
LABEL "tee.launch_policy.debug_monitoring"="MEMORY_ONLY"

ENTRYPOINT ["/main"]

Expand Down
4 changes: 2 additions & 2 deletions launcher/image/testworkloads/memorymonitoringnever/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ COPY main /

ENV env_bar="val_bar"

LABEL "tee.launch_policy.monitoring_memory_allow"="never"

LABEL "tee.launch_policy.hardened_monitoring"="NONE"
LABEL "tee.launch_policy.debug_monitoring"="NONE"
ENTRYPOINT ["/main"]

CMD ["arg_foo"]
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,101 @@ package nodeproblemdetector
import (
"encoding/json"
"fmt"
"log"
"os"
"time"

"github.com/google/go-tpm-tools/launcher/internal/systemctl"
)

var systemStatsFilePath = "/etc/node_problem_detector/system-stats-monitor.json"

var defaultInvokeIntervalString = (60 * time.Second).String()

type metricConfig struct {
DisplayName string `json:"displayName"`
}

type memoryStatsConfig struct {
type statsConfig struct {
MetricsConfigs map[string]metricConfig `json:"metricsConfigs"`
}

// diskConfig holds the "disk" section of the system stats monitor configuration.
// NOTE(review): MetricsConfigs is a *statsConfig, and statsConfig itself marshals
// its map under a "metricsConfigs" key, so the emitted JSON nests "metricsConfigs"
// twice for disk. Confirm against the node-problem-detector system-stats-monitor
// schema that this is the intended shape.
type diskConfig struct {
// IncludeAllAttachedBlk is written as "includeAllAttachedBlk".
IncludeAllAttachedBlk bool `json:"includeAllAttachedBlk"`
// IncludeRootBlk is written as "includeRootBlk".
IncludeRootBlk bool `json:"includeRootBlk"`
// LsblkTimeout is written as "lsblkTimeout"; it is a string such as "5s".
LsblkTimeout string `json:"lsblkTimeout"`
// MetricsConfigs lists the disk metrics to enable, keyed by metric name.
MetricsConfigs *statsConfig `json:"metricsConfigs"`
}

// SystemStatsConfig contains configurations for `System Stats Monitor`,
// a problem daemon in node-problem-detector that collects pre-defined health-related metrics from different system components.
// For now we only consider collecting memory related metrics.
jessieqliu marked this conversation as resolved.
Show resolved Hide resolved
// View the comprehensive configuration details on https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor#detailed-configuration-options
type SystemStatsConfig struct {
MemoryStatsConfig memoryStatsConfig `json:"memory"`
InvokeInterval string `json:"invokeInterval"`
CPU *statsConfig `json:"cpu,omitempty"`
Disk *diskConfig `json:"disk,omitempty"`
Host *statsConfig `json:"host,omitempty"`
Memory *statsConfig `json:"memory,omitempty"`
InvokeInterval string `json:"invokeInterval,omitempty"`
jessieqliu marked this conversation as resolved.
Show resolved Hide resolved
}

// NewSystemStatsConfig returns a new SystemStatsConfig struct with default configurations.
func NewSystemStatsConfig() SystemStatsConfig {
return SystemStatsConfig{
MemoryStatsConfig: memoryStatsConfig{MetricsConfigs: map[string]metricConfig{}},
InvokeInterval: defaultInvokeIntervalString,
Memory: &statsConfig{MetricsConfigs: map[string]metricConfig{}},
InvokeInterval: defaultInvokeIntervalString,
}
}

// allConfig is the system stats monitor configuration written when all health
// monitoring metrics are enabled. It turns on the listed CPU, disk, host and
// memory metrics, each displayed under its own metric name, and uses the
// default invoke interval.
var allConfig = &SystemStatsConfig{
	CPU: &statsConfig{MetricsConfigs: map[string]metricConfig{
		"cpu/runnable_task_count": {DisplayName: "cpu/runnable_task_count"},
		"cpu/usage_time":          {DisplayName: "cpu/usage_time"},
		"cpu/load_1m":             {DisplayName: "cpu/load_1m"},
		"cpu/load_5m":             {DisplayName: "cpu/load_5m"},
		"cpu/load_15m":            {DisplayName: "cpu/load_15m"},
		"system/cpu_stat":         {DisplayName: "system/cpu_stat"},
		"system/interrupts_total": {DisplayName: "system/interrupts_total"},
		"system/processes_total":  {DisplayName: "system/processes_total"},
		"system/procs_blocked":    {DisplayName: "system/procs_blocked"},
		"system/procs_running":    {DisplayName: "system/procs_running"},
	}},
	Disk: &diskConfig{
		IncludeAllAttachedBlk: true,
		IncludeRootBlk:        true,
		LsblkTimeout:          "5s",
		MetricsConfigs: &statsConfig{MetricsConfigs: map[string]metricConfig{
			"disk/avg_queue_len":          {DisplayName: "disk/avg_queue_len"},
			"disk/bytes_used":             {DisplayName: "disk/bytes_used"},
			"disk/percent_used":           {DisplayName: "disk/percent_used"},
			"disk/io_time":                {DisplayName: "disk/io_time"},
			"disk/merged_operation_count": {DisplayName: "disk/merged_operation_count"},
			"disk/operation_bytes_count":  {DisplayName: "disk/operation_bytes_count"},
			"disk/operation_count":        {DisplayName: "disk/operation_count"},
			"disk/operation_time":         {DisplayName: "disk/operation_time"},
			"disk/weighted_io":            {DisplayName: "disk/weighted_io"},
		}},
	},
	Host: &statsConfig{MetricsConfigs: map[string]metricConfig{
		"host/uptime": {DisplayName: "host/uptime"},
	}},
	Memory: &statsConfig{MetricsConfigs: map[string]metricConfig{
		"memory/anonymous_used":   {DisplayName: "memory/anonymous_used"},
		"memory/bytes_used":       {DisplayName: "memory/bytes_used"},
		"memory/dirty_used":       {DisplayName: "memory/dirty_used"},
		"memory/page_cache_used":  {DisplayName: "memory/page_cache_used"},
		"memory/unevictable_used": {DisplayName: "memory/unevictable_used"},
		"memory/percent_used":     {DisplayName: "memory/percent_used"},
	}},
	InvokeInterval: defaultInvokeIntervalString,
}

// EnableAllConfig overwrites system stats config with health monitoring config.
func EnableAllConfig() error {
return allConfig.WriteFile(systemStatsFilePath)
}

// EnableMemoryBytesUsed enables "memory/bytes_used" for memory monitoring.
func (ssc *SystemStatsConfig) EnableMemoryBytesUsed() {
ssc.MemoryStatsConfig.MetricsConfigs["memory/bytes_used"] = metricConfig{DisplayName: "memory/bytes_used"}
ssc.Memory.MetricsConfigs["memory/bytes_used"] = metricConfig{DisplayName: "memory/bytes_used"}
}

// WithInvokeInterval overrides the default invokeInterval.
Expand All @@ -53,3 +114,20 @@ func (ssc *SystemStatsConfig) WriteFile(path string) error {
}
return os.WriteFile(path, bytes, 0644)
}

// StartService starts the node-problem-detector systemd unit.
//
// It creates a systemctl client, asks it to start
// node-problem-detector.service, and closes the client before returning.
// Progress is reported through logger.
//
// A non-nil error means either the systemctl client could not be created or
// the unit could not be started. In both cases the underlying cause is wrapped
// so it appears in the message and can be inspected with errors.Is/errors.As.
func StartService(logger *log.Logger) error {
	s, err := systemctl.New()
	if err != nil {
		return fmt.Errorf("failed to create systemctl client: %w", err)
	}
	defer s.Close()

	logger.Printf("Starting node-problem-detector.service")
	if err := s.Start("node-problem-detector.service"); err != nil {
		// Keep the cause: without it the caller's log only says that the start
		// failed, with no indication of why.
		return fmt.Errorf("failed to start node-problem-detector.service: %w", err)
	}

	logger.Printf("node-problem-detector.service successfully started")
	return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package nodeproblemdetector

import (
"bytes"
"encoding/json"
"io"
"os"
"path"
Expand All @@ -11,12 +12,38 @@ import (
"github.com/google/go-cmp/cmp"
)

// TestEnableHealthMonitoringConfig checks that EnableAllConfig writes the JSON
// encoding of allConfig to systemStatsFilePath.
func TestEnableHealthMonitoringConfig(t *testing.T) {
	// Point the package-level output path at a per-test temp dir, and restore
	// it afterwards so tests that run later see the original value.
	origPath := systemStatsFilePath
	t.Cleanup(func() { systemStatsFilePath = origPath })
	systemStatsFilePath = path.Join(t.TempDir(), "system-stats-monitor.json")

	wantBytes, err := json.Marshal(allConfig)
	if err != nil {
		t.Fatalf("Error marshaling expected config: %v", err)
	}

	// Fail here with the real cause rather than later with a confusing
	// "failed to open file" if the write itself went wrong.
	if err := EnableAllConfig(); err != nil {
		t.Fatalf("EnableAllConfig() returned error: %v", err)
	}

	file, err := os.Open(systemStatsFilePath)
	if err != nil {
		t.Fatalf("failed to open file %s: %v", systemStatsFilePath, err)
	}
	defer file.Close()

	gotBytes, err := io.ReadAll(file)
	if err != nil {
		t.Fatalf("failed to read from file %s: %v", systemStatsFilePath, err)
	}

	if !bytes.Equal(gotBytes, wantBytes) {
		t.Errorf("EnableAllConfig() did not write expected contents, got %s, want %s", gotBytes, wantBytes)
	}
}

func TestEnableMemoryBytesUsed(t *testing.T) {
got := NewSystemStatsConfig()
got.EnableMemoryBytesUsed()

want := SystemStatsConfig{
MemoryStatsConfig: memoryStatsConfig{
Memory: &statsConfig{
MetricsConfigs: map[string]metricConfig{
"memory/bytes_used": {DisplayName: "memory/bytes_used"},
},
Expand Down
21 changes: 21 additions & 0 deletions launcher/launcher/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/containerd/containerd/namespaces"
"github.com/google/go-tpm-tools/client"
"github.com/google/go-tpm-tools/launcher"
"github.com/google/go-tpm-tools/launcher/internal/healthmonitoring/nodeproblemdetector"
"github.com/google/go-tpm-tools/launcher/launcherfile"
"github.com/google/go-tpm-tools/launcher/registryauth"
"github.com/google/go-tpm-tools/launcher/spec"
Expand Down Expand Up @@ -95,6 +96,26 @@ func main() {
return
}

if launchSpec.MonitoringEnabled != spec.None {
logger.Printf("Health Monitoring is enabled by the VM operator")

if launchSpec.MonitoringEnabled == spec.All {
logger.Printf("All health monitoring metrics enabled")
if err := nodeproblemdetector.EnableAllConfig(); err != nil {
logger.Printf("failed to enable full monitoring config: %v", err)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since it's logged, this should be capitalized: "Failed to ..."

return
}
} else if launchSpec.MonitoringEnabled == spec.MemoryOnly {
logger.Printf("memory/bytes_used enabled")
}

if err := nodeproblemdetector.StartService(logger); err != nil {
logger.Print(err)
}
} else {
logger.Printf("Health Monitoring is disabled")
}

defer func() {
// Catch panic to attempt to output to Cloud Logging.
if r := recover(); r != nil {
Expand Down
Loading
Loading