Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rpk: add rpk debug remote-bundle; collect a cluster-wide bundle #23986

Merged
merged 2 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
rpk: add rpk debug remote bundle
Fixes DEVEX-44

This commit introduces the rpk debug remote bundle
command, which allows the user to request a debug
bundle using the Admin API.
  • Loading branch information
r-vasquez committed Nov 22, 2024
commit b88a85700ac4bb34140d3d8c53f1780d0e74a7f5
1 change: 1 addition & 0 deletions MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ use_repo(
"com_github_docker_go_connections",
"com_github_docker_go_units",
"com_github_fatih_color",
"com_github_google_uuid",
"com_github_hamba_avro",
"com_github_hamba_avro_v2",
"com_github_hashicorp_go_multierror",
Expand Down
2 changes: 1 addition & 1 deletion src/go/rpk/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ require (
github.com/docker/go-connections v0.5.0
github.com/docker/go-units v0.5.0
github.com/fatih/color v1.18.0
github.com/google/uuid v1.6.0
github.com/hamba/avro/v2 v2.27.0
github.com/hashicorp/go-multierror v1.1.1
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
Expand Down Expand Up @@ -94,7 +95,6 @@ require (
github.com/google/gnostic-models v0.6.9-0.20230804172637-c7be7c783f49 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
Expand Down
1 change: 1 addition & 0 deletions src/go/rpk/pkg/cli/debug/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//src/go/rpk/pkg/cli/debug/bundle",
"//src/go/rpk/pkg/cli/debug/remotebundle",
"//src/go/rpk/pkg/config",
"@com_github_spf13_afero//:afero",
"@com_github_spf13_cobra//:cobra",
Expand Down
2 changes: 1 addition & 1 deletion src/go/rpk/pkg/cli/debug/bundle/bundle.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ func NewCommand(fs afero.Fs, p *config.Params) *cobra.Command {

f := cmd.Flags()
f.StringVarP(&outFile, outputFlag, "o", "", "The file path where the debug file will be written (default ./<timestamp>-bundle.zip)")
f.DurationVar(&timeout, "timeout", 31*time.Second, "How long to wait for child commands to execute (e.g. 30s, 1.5m)")
f.DurationVar(&timeout, "timeout", 31*time.Second, "How long to wait for child commands to execute. For example: 30s, 1.5m")
f.StringVar(&uploadURL, "upload-url", "", "If provided, where to upload the bundle in addition to creating a copy on disk")
// Debug bundle options.
opts.InstallFlags(f)
Expand Down
9 changes: 8 additions & 1 deletion src/go/rpk/pkg/cli/debug/common/BUILD
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
load("@rules_go//go:def.bzl", "go_library")
load("@rules_go//go:def.bzl", "go_library", "go_test")

go_library(
name = "common",
Expand All @@ -7,3 +7,10 @@ go_library(
visibility = ["//visibility:public"],
deps = ["@com_github_spf13_pflag//:pflag"],
)

go_test(
name = "common_test",
srcs = ["common_test.go"],
embed = [":common"],
deps = ["@com_github_stretchr_testify//require"],
)
18 changes: 9 additions & 9 deletions src/go/rpk/pkg/cli/debug/common/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,16 @@ type DebugBundleSharedOptions struct {
// InstallFlags installs the debug bundle flags that fills the debug bundle
// options.
func (o *DebugBundleSharedOptions) InstallFlags(f *pflag.FlagSet) {
f.StringVar(&o.ControllerLogsSizeLimit, "controller-logs-size-limit", "132MB", "The size limit of the controller logs that can be stored in the bundle (e.g. 3MB, 1GiB)")
f.DurationVar(&o.CPUProfilerWait, "cpu-profiler-wait", 30*time.Second, "For how long to collect samples for the CPU profiler (e.g. 30s, 1.5m). Must be higher than 15s")
f.StringVar(&o.LogsSizeLimit, "logs-size-limit", "100MiB", "Read the logs until the given size is reached (e.g. 3MB, 1GiB)")
f.StringVar(&o.LogsSince, "logs-since", "yesterday", "Include logs dated from specified date onward; (journalctl date format: YYYY-MM-DD, 'yesterday', or 'today'). Refer to journalctl documentation for more options")
f.StringVar(&o.LogsUntil, "logs-until", "", "Include logs older than the specified date; (journalctl date format: YYYY-MM-DD, 'yesterday', or 'today'). Refer to journalctl documentation for more options")
f.DurationVar(&o.MetricsInterval, "metrics-interval", 10*time.Second, "Interval between metrics snapshots (e.g. 30s, 1.5m)")
f.StringVar(&o.ControllerLogsSizeLimit, "controller-logs-size-limit", "132MB", "The size limit of the controller logs that can be stored in the bundle. For example: 3MB, 1GiB")
f.DurationVar(&o.CPUProfilerWait, "cpu-profiler-wait", 30*time.Second, "How long to collect samples for the CPU profiler. For example: 30s, 1.5m. Must be higher than 15s")
f.StringVar(&o.LogsSizeLimit, "logs-size-limit", "100MiB", "Read the logs until the given size is reached. For example: 3MB, 1GiB")
f.StringVar(&o.LogsSince, "logs-since", "yesterday", "Include logs dated from specified date onward; (journalctl date format: YYYY-MM-DD, 'yesterday', or 'today'). See the journalctl documentation for more options")
f.StringVar(&o.LogsUntil, "logs-until", "", "Include logs older than the specified date; (journalctl date format: YYYY-MM-DD, 'yesterday', or 'today'). See the journalctl documentation for more options")
f.DurationVar(&o.MetricsInterval, "metrics-interval", 10*time.Second, "Interval between metrics snapshots. For example: 30s, 1.5m")
f.IntVar(&o.MetricsSampleCount, "metrics-samples", 2, "Number of metrics samples to take (at the interval of --metrics-interval). Must be >= 2")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
f.IntVar(&o.MetricsSampleCount, "metrics-samples", 2, "Number of metrics samples to take (at the interval of --metrics-interval). Must be >= 2")
f.IntVar(&o.MetricsSampleCount, "metrics-samples", 2, "Number of metrics samples to take (at the interval of --metrics-interval). Must be >= 2.")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment as the above on periods.

f.StringArrayVarP(&o.PartitionFlag, "partition", "p", nil, "Comma-separated partition IDs; when provided, rpk saves extra admin API requests for those partitions. Check help for extended usage")
f.StringVarP(&o.Namespace, "namespace", "n", "redpanda", "The namespace to use to collect the resources from (k8s only)")
f.StringArrayVarP(&o.LabelSelector, "label-selector", "l", []string{"app.kubernetes.io/name=redpanda"}, "Comma-separated label selectors to filter your resources. e.g: <label>=<value>,<label>=<value> (k8s only)")
f.StringArrayVarP(&o.PartitionFlag, "partition", "p", nil, "Comma-separated partition IDs. When provided, rpk saves extra Admin API requests for those partitions. See the help for extended usage")
f.StringVarP(&o.Namespace, "namespace", "n", "redpanda", "The namespace to use to collect the resources from (K8s only)")
f.StringArrayVarP(&o.LabelSelector, "label-selector", "l", []string{"app.kubernetes.io/name=redpanda"}, "Comma-separated label selectors to filter your resources. For example: <label>=<value>,<label>=<value> (K8s only)")
}

// SanitizeName replace any of the following characters with "-": "<", ">", ":",
Expand Down
58 changes: 58 additions & 0 deletions src/go/rpk/pkg/cli/debug/common/common_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package common

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestSanitizeName(t *testing.T) {
tests := []struct {
name string
input string
exp string
}{
{
name: "No forbidden characters",
input: "validName",
exp: "validName",
},
{
name: "Single forbidden character",
input: "invalid:8083",
exp: "invalid-8083",
},
{
name: "Multiple forbidden characters",
input: "name/with|forbidden?chars",
exp: "name-with-forbidden-chars",
},
{
name: "Only forbidden characters",
input: `<>:\"/\\|?*`,
exp: "-----------",
},
{
name: "Empty string",
input: "",
exp: "",
},
{
name: "No change with already sanitized name",
input: "cleanName123",
exp: "cleanName123",
},
{
name: "Name with numbers and special characters",
input: "name123|test*<>",
exp: "name123-test---",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
actual := SanitizeName(tt.input)
require.Equal(t, tt.exp, actual)
})
}
}
2 changes: 2 additions & 0 deletions src/go/rpk/pkg/cli/debug/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ package debug

import (
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/debug/bundle"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/debug/remotebundle"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
"github.com/spf13/afero"
"github.com/spf13/cobra"
Expand All @@ -25,6 +26,7 @@ func NewCommand(fs afero.Fs, p *config.Params) *cobra.Command {
cmd.AddCommand(
bundle.NewCommand(fs, p),
NewInfoCommand(),
remotebundle.NewCommand(fs, p),
)

return cmd
Expand Down
27 changes: 27 additions & 0 deletions src/go/rpk/pkg/cli/debug/remotebundle/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
load("@rules_go//go:def.bzl", "go_library")

go_library(
name = "remotebundle",
srcs = [
"cancel.go",
"download.go",
"remote.go",
"start.go",
"status.go",
],
importpath = "github.com/redpanda-data/redpanda/src/go/rpk/pkg/cli/debug/remotebundle",
visibility = ["//visibility:public"],
deps = [
"//src/go/rpk/pkg/adminapi",
"//src/go/rpk/pkg/cli/debug/common",
"//src/go/rpk/pkg/config",
"//src/go/rpk/pkg/out",
"@com_github_docker_go_units//:go-units",
"@com_github_google_uuid//:uuid",
"@com_github_redpanda_data_common_go_rpadmin//:rpadmin",
"@com_github_spf13_afero//:afero",
"@com_github_spf13_cobra//:cobra",
"@io_k8s_apimachinery//pkg/labels",
"@org_golang_x_sync//errgroup",
],
)
121 changes: 121 additions & 0 deletions src/go/rpk/pkg/cli/debug/remotebundle/cancel.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2024 Redpanda Data, Inc.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.md
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0

package remotebundle

import (
"fmt"
"os"
"sync"

"github.com/redpanda-data/redpanda/src/go/rpk/pkg/config"
"github.com/redpanda-data/redpanda/src/go/rpk/pkg/out"
"github.com/spf13/afero"
"github.com/spf13/cobra"
)

type cancelResponse struct {
Broker string
Canceled bool
Error string
}

func newCancelCommand(fs afero.Fs, p *config.Params) *cobra.Command {
var noConfirm bool
var jobID string
cmd := &cobra.Command{
Use: "cancel",
Short: "Cancel a remote bundle execution",
Long: `Cancel a remote bundle execution.

This command cancels the debug collection process in a remote cluster that
you configured in flags, environment variables, or your rpk profile.

Use the flag '--job-id' to only cancel the debug bundle with
the given job ID.

Use the flag '--no-confirm' to avoid the confirmation prompt.
`,
Args: cobra.NoArgs,
Run: func(cmd *cobra.Command, _ []string) {
p, err := p.LoadVirtualProfile(fs)
out.MaybeDie(err, "rpk unable to load config: %v", err)
config.CheckExitCloudAdmin(p)

status, anyErr, _, cache := executeBundleStatus(cmd.Context(), fs, p)
if jobID != "" {
status = filterStatusByJobID(status, jobID)
}
if !noConfirm {
printTextBundleStatus(status, anyErr)
confirmed, err := out.Confirm("Confirm debug bundle cancel from these brokers?")
out.MaybeDie(err, "unable to confirm cancel: %v; if you want to select a single broker, use -X admin.hosts=<brokers,to,cancel>", err)
if !confirmed {
out.Exit("operation canceled; if you want to select a single broker, use -X admin.hosts=<brokers,to,cancel>")
}
}
var (
wg sync.WaitGroup
rwMu sync.RWMutex // read from cache.
mu sync.Mutex // write to status.
response []cancelResponse
anyCancelErr bool
)
updateStatus := func(resp cancelResponse, err error) {
mu.Lock()
defer mu.Unlock()
anyCancelErr = anyCancelErr || err != nil
response = append(response, resp)
}
for _, s := range status {
wg.Add(1)
go func(addr, jobID string) {
defer wg.Done()
rwMu.RLock()
cl := cache[addr]
rwMu.RUnlock()
resp := cancelResponse{Broker: addr}
err := cl.CancelDebugBundleProcess(cmd.Context(), jobID)
if err != nil {
resp.Error = fmt.Sprintf("unable to cancel debug bundle: %s", tryMessageFromErr(err))
updateStatus(resp, err)
return
}
resp.Canceled = true
updateStatus(resp, nil)
}(s.Broker, s.JobID)
}
wg.Wait()
headers := []string{"broker", "canceled"}
if anyCancelErr {
headers = append(headers, "error")
defer os.Exit(1)
}

tw := out.NewTable(headers...)
defer tw.Flush()
for _, row := range response {
tw.PrintStructFields(row)
}
},
}
cmd.Flags().StringVar(&jobID, "job-id", "", "ID of the job to cancel the debug bundle")
cmd.Flags().BoolVar(&noConfirm, "no-confirm", false, "Disable confirmation prompt")
return cmd
}

func filterStatusByJobID(status []statusResponse, jobID string) []statusResponse {
var filtered []statusResponse
for _, s := range status {
if s.JobID == jobID {
filtered = append(filtered, s)
}
}
return filtered
}
Loading