Skip to content

Commit

Permalink
Enable memory monitoring in CS (#391)
Browse files Browse the repository at this point in the history
* Implement memory monitoring

* Adding image tests for memory monitoring
  • Loading branch information
yawangwang authored Nov 22, 2023
1 parent 8519135 commit 38bab91
Show file tree
Hide file tree
Showing 24 changed files with 619 additions and 30 deletions.
14 changes: 14 additions & 0 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,20 @@ steps:
gcloud builds submit --config=test_discover_signatures.yaml --region us-west1 \
--substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-debug-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID},_SIGNATURE_REPO=us-docker.pkg.dev/confidential-space-images-dev/cs-cosign-tests/debug
exit
- name: 'gcr.io/cloud-builders/gcloud'
id: MemoryMonitoringTests
waitFor: ['HardenedImageBuild']
env:
- 'OUTPUT_IMAGE_PREFIX=$_OUTPUT_IMAGE_PREFIX'
- 'OUTPUT_IMAGE_SUFFIX=$_OUTPUT_IMAGE_SUFFIX'
- 'PROJECT_ID=$PROJECT_ID'
script: |
#!/usr/bin/env bash
cd launcher/image/test
echo "running memory monitoring tests on ${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX}"
gcloud builds submit --config=test_memory_monitoring.yaml --region us-west1 \
--substitutions _IMAGE_NAME=${OUTPUT_IMAGE_PREFIX}-hardened-${OUTPUT_IMAGE_SUFFIX},_IMAGE_PROJECT=${PROJECT_ID}
exit
options:
pool:
Expand Down
4 changes: 0 additions & 4 deletions cmd/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,6 @@ github.com/coreos/etcd v3.3.13+incompatible/go.mod h1:uF7uidLiAD3TWHmW31ZFd/JWoc
github.com/coreos/go-etcd v2.0.0+incompatible/go.mod h1:Jez6KQU2B/sWsbdaef3ED8NzMklzPG4d5KIOhIy30Tk=
github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk=
github.com/coreos/go-systemd v0.0.0-20180511133405-39ca1b05acc7/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk=
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
Expand Down
5 changes: 5 additions & 0 deletions go.work.sum
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,11 @@ cloud.google.com/go/webrisk v1.9.2/go.mod h1:pY9kfDgAqxUpDBOrG4w8deLfhvJmejKB0qd
cloud.google.com/go/websecurityscanner v1.6.2/go.mod h1:7YgjuU5tun7Eg2kpKgGnDuEOXWIrh8x8lWrJT4zfmas=
cloud.google.com/go/workflows v1.12.1/go.mod h1:5A95OhD/edtOhQd/O741NSfIMezNTbCwLM1P1tBRGHM=
github.com/cenkalti/backoff v2.2.1+incompatible h1:tNowT99t7UNflLxfYYSlKYsBpXdEet03Pg2g16Swow4=
github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e h1:Wf6HqHfScWJN9/ZjdUKyjop4mf3Qdd+1TvvltAvM3m8=
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU=
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e h1:BWhy2j3IXJhjCbC68FptL43tDKIq8FladmaTs3Xs7Z8=
github.com/census-instrumentation/opencensus-proto v0.4.1/go.mod h1:4T9NM4+4Vw91VeyqjLS6ao50K5bOcLKN6Q42XnYaRYw=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cncf/udpa/go v0.0.0-20220112060539-c52dc94e7fbe/go.mod h1:6pvJx4me5XPnfI9Z40ddWsdw2W/uZgQLFXToKeRcDiI=
Expand Down
19 changes: 19 additions & 0 deletions launcher/container_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/google/go-tpm-tools/client"
"github.com/google/go-tpm-tools/launcher/agent"
"github.com/google/go-tpm-tools/launcher/internal/signaturediscovery"
"github.com/google/go-tpm-tools/launcher/internal/systemctl"
"github.com/google/go-tpm-tools/launcher/launcherfile"
"github.com/google/go-tpm-tools/launcher/spec"
"github.com/google/go-tpm-tools/launcher/teeserver"
Expand Down Expand Up @@ -507,6 +508,24 @@ func (r *ContainerRunner) Run(ctx context.Context) error {
defer teeServer.Shutdown(ctx)
}

// start node-problem-detector.service to collect memory related metrics.
if r.launchSpec.MemoryMonitoringEnabled {
r.logger.Println("MemoryMonitoring is enabled")
s, err := systemctl.New()
if err != nil {
return fmt.Errorf("failed to create systemctl client: %v", err)
}
defer s.Close()

r.logger.Println("Starting a systemctl operation: systemctl start node-problem-detector.service")
if err := s.Start("node-problem-detector.service"); err != nil {
return fmt.Errorf("failed to start node-problem-detector.service: %v", err)
}
r.logger.Println("node-problem-detector.service successfully started.")
} else {
r.logger.Println("MemoryMonitoring is disabled.")
}

var streamOpt cio.Opt
switch r.launchSpec.LogRedirect {
case spec.Nowhere:
Expand Down
2 changes: 2 additions & 0 deletions launcher/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
cloud.google.com/go/confidentialcomputing v1.4.0
github.com/cenkalti/backoff/v4 v4.1.3
github.com/containerd/containerd v1.6.18
github.com/coreos/go-systemd/v22 v22.5.0
github.com/golang-jwt/jwt/v4 v4.4.1
github.com/google/go-cmp v0.6.0
github.com/google/go-tpm v0.9.0
Expand All @@ -31,6 +32,7 @@ require (
github.com/containerd/ttrpc v1.1.0 // indirect
github.com/containerd/typeurl v1.0.2 // indirect
github.com/docker/go-events v0.0.0-20190806004212-e31b211e4f1c // indirect
github.com/godbus/dbus/v5 v5.0.6 // indirect
github.com/gogo/googleapis v1.4.1 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
Expand Down
3 changes: 3 additions & 0 deletions launcher/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,8 @@ github.com/coreos/go-systemd v0.0.0-20190321100706-95778dfbb74e/go.mod h1:F5haX7
github.com/coreos/go-systemd/v22 v22.0.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk=
github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk=
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/coreos/pkg v0.0.0-20160727233714-3ac0863d7acf/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f/go.mod h1:E3G3o1h8I7cfcXa63jLwjI0eiQQMgzzUDFVpN/nH/eA=
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
Expand Down Expand Up @@ -457,6 +459,7 @@ github.com/godbus/dbus v0.0.0-20180201030542-885f9cc04c9c/go.mod h1:/YcGZj5zSblf
github.com/godbus/dbus v0.0.0-20190422162347-ade71ed3457e/go.mod h1:bBOAhwG1umN6/6ZUMtDFBMQR8jRg9O75tm9K00oMsK4=
github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/godbus/dbus/v5 v5.0.6 h1:mkgN1ofwASrYnJ5W6U/BxG15eXXXjirgZc7CLqkcaro=
github.com/godbus/dbus/v5 v5.0.6/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gogo/googleapis v1.1.0/go.mod h1:gf4bu3Q80BeJ6H1S1vYPm8/ELATdvryBaNFGgqEef3s=
github.com/gogo/googleapis v1.2.0/go.mod h1:Njal3psf3qN6dwBtQfUmBZh2ybovJ0tlu3o/AC7HYjU=
Expand Down
8 changes: 8 additions & 0 deletions launcher/image/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@ main() {
# Override default fluent-bit config.
cp /usr/share/oem/confidential_space/fluent-bit-cs.conf /etc/fluent-bit/fluent-bit.conf

# Override default system-stats-monitor.json for node-problem-detector.
cp /usr/share/oem/confidential_space/system-stats-monitor-cs.json /etc/node_problem_detector/system-stats-monitor.json
# Override default boot-disk-size-consistency-monitor.json for node-problem-detector.
cp /usr/share/oem/confidential_space/boot-disk-size-consistency-monitor-cs.json /etc/node_problem_detector/boot-disk-size-consistency-monitor.json
# Override default docker-monitor.json for node-problem-detector.
cp /usr/share/oem/confidential_space/docker-monitor-cs.json /etc/node_problem_detector/docker-monitor.json
# Override default kernel-monitor.json for node-problem-detector.
cp /usr/share/oem/confidential_space/kernel-monitor-cs.json /etc/node_problem_detector/kernel-monitor.json
systemctl daemon-reload
systemctl enable container-runner.service
systemctl start container-runner.service
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"plugin": "custom",
"pluginConfig": {
"invoke_interval": "30m",
"timeout": "7s",
"max_output_length": 80,
"enable_message_change_based_condition_update": false
},
"source": "boot-disk-size-consistency-monitor",
"metricsReporting": false,
"rules": []
}
12 changes: 12 additions & 0 deletions launcher/image/nodeproblemdetector/docker-monitor-cs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"plugin": "journald",
"pluginConfig": {
"source": "dockerd"
},
"logPath": "/var/log/journal",
"lookback": "5m",
"bufferSize": 10,
"source": "docker-monitor",
"metricsReporting": false,
"conditions": []
}
10 changes: 10 additions & 0 deletions launcher/image/nodeproblemdetector/kernel-monitor-cs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"plugin": "kmsg",
"logPath": "/dev/kmsg",
"lookback": "5m",
"bufferSize": 10,
"source": "kernel-monitor",
"metricsReporting": false,
"conditions": [],
"rules": []
}
10 changes: 10 additions & 0 deletions launcher/image/nodeproblemdetector/system-stats-monitor-cs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"memory": {
"metricsConfigs": {
"memory/bytes_used": {
"displayName": "memory/bytes_used"
}
}
},
"invokeInterval": "60s"
}
11 changes: 10 additions & 1 deletion launcher/image/preload.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,22 @@ configure_cloud_logging() {
cp fluent-bit-cs.conf "${CS_PATH}"
}

configure_node_problem_detector() {
  # Stage each CS-specific node-problem-detector config on the OEM partition.
  # Every file follows the "<monitor>-cs.json" naming scheme, so iterate over
  # the monitor names instead of spelling out one cp per file.
  local monitor
  for monitor in \
      system-stats-monitor \
      boot-disk-size-consistency-monitor \
      docker-monitor \
      kernel-monitor; do
    cp "nodeproblemdetector/${monitor}-cs.json" "${CS_PATH}"
  done
}

# Debug-image setup: runs the cloud-logging and node-problem-detector
# configuration helpers. This function itself disables no systemd units.
configure_systemd_units_for_debug() {
configure_cloud_logging
configure_node_problem_detector
}
configure_systemd_units_for_hardened() {
configure_necessary_systemd_units
configure_cloud_logging
configure_node_problem_detector
# Make entrypoint (via cloud-init) the default unit.
set_default_boot_target "cloud-final.service"

Expand All @@ -85,7 +95,6 @@ configure_systemd_units_for_hardened() {
disable_unit "konlet-startup.service"
disable_unit "crash-reporter.service"
disable_unit "device_policy_manager.service"
disable_unit "node-problem-detector.service"
disable_unit "docker-events-collector-fluent-bit.service"
disable_unit "sshd.service"
disable_unit "var-lib-toolbox.mount"
Expand Down
21 changes: 21 additions & 0 deletions launcher/image/test/scripts/test_memory_monitoring_enabled.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
set -euxo pipefail
source util/read_serial.sh

# Verifies that the launcher reported starting node-problem-detector.service.
#
# This test requires the workload to run and print the corresponding
# messages to the serial console.
#
# Arguments:
#   $1 - name of the VM whose serial console is read
#   $2 - zone of that VM
# On failure, writes 'TEST FAILED.' to /workspace/status.txt and dumps the
# captured serial output to aid debugging.
SERIAL_OUTPUT=$(read_serial "$1" "$2")
print_serial=false

# Feed grep through a here-string instead of `echo ... | grep -q`: with
# `set -o pipefail`, grep -q exits on the first match and a still-writing
# echo is killed by SIGPIPE, which makes the whole pipeline (and therefore
# the test) fail even though the message was found.
if grep -q 'node-problem-detector.service successfully started' <<< "${SERIAL_OUTPUT}"
then
  echo "- memory monitoring enabled"
else
  echo "FAILED: memory monitoring disabled"
  echo 'TEST FAILED.' > /workspace/status.txt
  print_serial=true
fi

if $print_serial; then
  # Quoted so the log keeps its line breaks and is not glob-expanded.
  echo "${SERIAL_OUTPUT}"
fi
2 changes: 1 addition & 1 deletion launcher/image/test/test_launchpolicy_cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ substitutions:
'_METADATA_FILE': 'startup-script=data/echo_startupscript.sh,user-data=data/cloud-init-config.yaml'
'_CLEANUP': 'true'
'_VM_NAME_PREFIX': 'cs-launchpolicy-test'
'_ZONE': 'us-west1-a'
'_ZONE': 'us-east4-a'
'_WORKLOAD_IMAGE_LOG_NEVER': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/launchpolicylognever:latest'
'_WORKLOAD_IMAGE_LOG_DEBUG': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/launchpolicylogdebug:latest'
'_WORKLOAD_IMAGE_ENV': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/basic-test:latest'
Expand Down
42 changes: 42 additions & 0 deletions launcher/image/test/test_memory_monitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Cloud Build config for the memory-monitoring image test: it runs
# create_vm.sh with memory monitoring requested through instance metadata,
# checks the serial console for the launcher's confirmation message, then
# cleans up and reports the overall result.
substitutions:
# _IMAGE_NAME and _IMAGE_PROJECT default to empty; the caller is expected to
# supply them via --substitutions.
'_IMAGE_NAME': ''
'_IMAGE_PROJECT': ''
# Forwarded to cleanup.sh as the CLEANUP environment variable.
'_CLEANUP': 'true'
'_VM_NAME_PREFIX': 'memory-monitoring'
'_ZONE': 'us-east1-b'
# Test workload image used as the tee-image-reference metadata value.
'_WORKLOAD_IMAGE': 'us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/memorymonitoring:latest'

steps:
# Runs create_vm.sh with the workload image reference and
# tee-monitoring-memory-enable=true passed as metadata (-m).
- name: 'gcr.io/cloud-builders/gcloud'
id: CreateVM
entrypoint: 'bash'
env:
- 'BUILD_ID=$BUILD_ID'
args: ['create_vm.sh','-i', '${_IMAGE_NAME}',
'-p', '${_IMAGE_PROJECT}',
'-m', 'tee-image-reference=${_WORKLOAD_IMAGE},tee-monitoring-memory-enable=true',
'-n', '${_VM_NAME_PREFIX}-${BUILD_ID}',
'-z', '${_ZONE}',
]
# Runs the check script against the same VM name and zone used by CreateVM.
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckMemoryMonitoringEnabled
entrypoint: 'bash'
args: ['scripts/test_memory_monitoring_enabled.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']

# Runs cleanup.sh for the test VM; CLEANUP carries the _CLEANUP substitution.
- name: 'gcr.io/cloud-builders/gcloud'
id: CleanUp
entrypoint: 'bash'
env:
- 'CLEANUP=$_CLEANUP'
args: ['cleanup.sh', '${_VM_NAME_PREFIX}-${BUILD_ID}', '${_ZONE}']
# Must come after cleanup.
# NOTE(review): presumably fails the build when an earlier step recorded a
# failure (the check script writes /workspace/status.txt) - confirm against
# check_failure.sh.
- name: 'gcr.io/cloud-builders/gcloud'
id: CheckFailure
entrypoint: 'bash'
env:
- 'BUILD_ID=$BUILD_ID'
args: ['check_failure.sh']

options:
pool:
name: 'projects/confidential-space-images-dev/locations/us-west1/workerPools/cs-image-build-vpc'
14 changes: 14 additions & 0 deletions launcher/image/testworkloads/memorymonitoring/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Test workload image for the memory-monitoring integration test.
#
# From current directory:
# GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o main ../basic
# gcloud builds submit --tag us-west1-docker.pkg.dev/confidential-space-images-dev/cs-integ-test-images/memorymonitoring:latest --project confidential-space-images-dev
FROM alpine

# "main" is the binary produced by the go build command above.
COPY main /

ENV env_bar="val_bar"

# Launch-policy label for memory monitoring.
# NOTE(review): "always" presumably permits monitoring on every image
# variant - confirm against the launcher's launch-policy parsing.
LABEL "tee.launch_policy.monitoring_memory_allow"="always"

ENTRYPOINT ["/main"]

# Default argument passed to the entrypoint.
CMD ["arg_foo"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Package nodeproblemdetector provides configurations for node-problem-detector.service.
package nodeproblemdetector

import (
"encoding/json"
"fmt"
"os"
"time"
)

// defaultInvokeIntervalString is the default collection interval (60 seconds)
// rendered as a Go duration string, i.e. "1m0s".
var defaultInvokeIntervalString = (60 * time.Second).String()

// metricConfig holds the per-metric settings emitted under "metricsConfigs".
type metricConfig struct {
	DisplayName string `json:"displayName"`
}

// memoryStatsConfig is the "memory" section of the config: a set of enabled
// metrics keyed by metric name.
type memoryStatsConfig struct {
	MetricsConfigs map[string]metricConfig `json:"metricsConfigs"`
}

// SystemStatsConfig contains configurations for `System Stats Monitor`,
// a problem daemon in node-problem-detector that collects pre-defined health-related metrics from different system components.
// For now we only consider collecting memory related metrics.
// View the comprehensive configuration details on https://github.com/kubernetes/node-problem-detector/tree/master/pkg/systemstatsmonitor#detailed-configuration-options
type SystemStatsConfig struct {
	MemoryStatsConfig memoryStatsConfig `json:"memory"`
	// InvokeInterval is the collection interval as a Go duration string.
	InvokeInterval string `json:"invokeInterval"`
}

// NewSystemStatsConfig returns a new SystemStatsConfig struct with default configurations:
// no metrics enabled and the default invoke interval.
func NewSystemStatsConfig() SystemStatsConfig {
	return SystemStatsConfig{
		MemoryStatsConfig: memoryStatsConfig{MetricsConfigs: map[string]metricConfig{}},
		InvokeInterval:    defaultInvokeIntervalString,
	}
}

// EnableMemoryBytesUsed enables "memory/bytes_used" for memory monitoring.
// It is safe to call on a zero-value SystemStatsConfig.
func (ssc *SystemStatsConfig) EnableMemoryBytesUsed() {
	const metric = "memory/bytes_used"
	// A SystemStatsConfig that was not built by NewSystemStatsConfig has a nil
	// map; allocate it here so the assignment below cannot panic.
	if ssc.MemoryStatsConfig.MetricsConfigs == nil {
		ssc.MemoryStatsConfig.MetricsConfigs = make(map[string]metricConfig, 1)
	}
	ssc.MemoryStatsConfig.MetricsConfigs[metric] = metricConfig{DisplayName: metric}
}

// WithInvokeInterval overrides the default invokeInterval.
// The duration is stored in its String() form (for example "30s").
func (ssc *SystemStatsConfig) WithInvokeInterval(interval time.Duration) {
	ssc.InvokeInterval = interval.String()
}

// WriteFile writes the SystemStatsConfig as JSON to the named file, creating it if necessary
// (mode 0644). It returns an error if marshaling or writing fails.
func (ssc *SystemStatsConfig) WriteFile(path string) error {
	data, err := json.Marshal(ssc)
	if err != nil {
		return fmt.Errorf("failed to marshal struct [%v]: %w", ssc, err)
	}
	return os.WriteFile(path, data, 0644)
}
Loading

0 comments on commit 38bab91

Please sign in to comment.