From 0a948740dddb3fc1d3e77ecb0f1f0e8b0e505049 Mon Sep 17 00:00:00 2001 From: Sambhav Jain Date: Wed, 28 Aug 2024 10:56:17 +0530 Subject: [PATCH] roachtest: add code changes to benchmarks to emit openmetrics https://github.com/cockroachdb/cockroach/pull/129221 and https://github.com/cockroachdb/cockroach/pull/132023 added changes to exporters to emit openmetrics. This PR makes changes to the roachtests to make use of the changes in the above PRs. This change also made some changes to some roachtests that use neither of the above approaches Epic: https://cockroachlabs.atlassian.net/browse/CRDB-41852 Release note: None --- pkg/cmd/roachprod-microbench/BUILD.bazel | 1 + pkg/cmd/roachprod-microbench/executor.go | 4 +- .../roachprod-microbench/roachprod_util.go | 46 +++++++++++ pkg/cmd/roachprod-microbench/stage.go | 15 ++-- pkg/cmd/roachprod-microbench/util/BUILD.bazel | 3 - pkg/cmd/roachprod-microbench/util/util.go | 47 +++-------- pkg/cmd/roachtest/clusterstats/BUILD.bazel | 1 + pkg/cmd/roachtest/clusterstats/exporter.go | 52 ++++++++++-- pkg/cmd/roachtest/roachtestutil/BUILD.bazel | 2 + pkg/cmd/roachtest/roachtestutil/utils.go | 70 ++++++++++++++++ ...mission_control_disk_bandwidth_overload.go | 22 +++-- .../tests/admission_control_elastic_io.go | 11 ++- ...admission_control_elastic_mixed_version.go | 14 +++- .../admission_control_multi_store_overload.go | 5 +- .../admission_control_snapshot_overload.go | 8 +- .../admission_control_snapshot_overload_io.go | 10 ++- .../tests/admission_control_tpcc_overload.go | 10 ++- pkg/cmd/roachtest/tests/backup.go | 34 ++++---- pkg/cmd/roachtest/tests/cdc_bench.go | 35 ++++---- pkg/cmd/roachtest/tests/connection_latency.go | 9 +- pkg/cmd/roachtest/tests/decommissionbench.go | 65 +++++++++------ pkg/cmd/roachtest/tests/failover.go | 26 ++++-- pkg/cmd/roachtest/tests/import.go | 30 +++---- pkg/cmd/roachtest/tests/indexes.go | 7 +- pkg/cmd/roachtest/tests/kv.go | 4 +- .../roachtest/tests/large_schema_benchmark.go | 8 +- pkg/cmd/roachtest/tests/ledger.go | 9 +- .../tests/loss_of_quorum_recovery.go | 31 ++++--- .../roachtest/tests/perturbation/framework.go | 33 ++++---- pkg/cmd/roachtest/tests/queue.go | 17 ++-- pkg/cmd/roachtest/tests/restore.go | 30 +++---- .../tests/schemachange_random_load.go | 10 ++- pkg/cmd/roachtest/tests/tpcc.go | 82 ++++++++++++++----- pkg/cmd/roachtest/tests/tpce.go | 43 +++++++++- pkg/cmd/roachtest/tests/tpchbench.go | 10 ++- pkg/cmd/roachtest/tests/ycsb.go | 12 ++- pkg/workload/cli/run.go | 7 +- pkg/workload/histogram/exporter/BUILD.bazel | 1 + .../exporter/openmetrics_exporter.go | 8 +- pkg/workload/histogram/exporter/util.go | 15 +--- 40 files changed, 574 insertions(+), 273 deletions(-) create mode 100644 pkg/cmd/roachprod-microbench/roachprod_util.go diff --git a/pkg/cmd/roachprod-microbench/BUILD.bazel b/pkg/cmd/roachprod-microbench/BUILD.bazel index 9d708f17b9a3..2055ccab6e09 100644 --- a/pkg/cmd/roachprod-microbench/BUILD.bazel +++ b/pkg/cmd/roachprod-microbench/BUILD.bazel @@ -11,6 +11,7 @@ go_library( "main.go", "metadata.go", "report.go", + "roachprod_util.go", "slack.go", "stage.go", ], diff --git a/pkg/cmd/roachprod-microbench/executor.go b/pkg/cmd/roachprod-microbench/executor.go index dd325a557b5a..bba409d31116 100644 --- a/pkg/cmd/roachprod-microbench/executor.go +++ b/pkg/cmd/roachprod-microbench/executor.go @@ -109,7 +109,7 @@ func newExecutor(config executorConfig) (*executor, error) { roachprodConfig.Quiet = config.quiet timestamp := timeutil.Now() - l := util.InitLogger(filepath.Join(config.outputDir, fmt.Sprintf("roachprod-microbench-%s.log", timestamp.Format(util.TimeFormat)))) + l := InitLogger(filepath.Join(config.outputDir, fmt.Sprintf("roachprod-microbench-%s.log", timestamp.Format(util.TimeFormat)))) excludeBenchmarks := util.GetRegexExclusionPairs(config.excludeList) return &executor{ @@ -275,7 +275,7 @@ func (e *executor) executeBenchmarks() error { } // Init `roachprod` and get the number of nodes in the cluster. - util.InitRoachprod() + InitRoachprod() statuses, err := roachprod.Status(context.Background(), e.log, e.cluster, "") if err != nil { return err diff --git a/pkg/cmd/roachprod-microbench/roachprod_util.go b/pkg/cmd/roachprod-microbench/roachprod_util.go new file mode 100644 index 000000000000..af2f4a900695 --- /dev/null +++ b/pkg/cmd/roachprod-microbench/roachprod_util.go @@ -0,0 +1,46 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +package main + +import ( + "context" + "fmt" + "os" + + "github.com/cockroachdb/cockroach/pkg/roachprod" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" +) + +// InitRoachprod initializes the roachprod providers by calling InitProviders. +// This function sets up the environment for running roachprod commands. +func InitRoachprod() { + _ = roachprod.InitProviders() +} + +// RoachprodRun runs a command on a roachprod cluster with the given cluster name and logger. +// It takes a list of command arguments and passes them to the roachprod command execution. +func RoachprodRun(clusterName string, l *logger.Logger, cmdArray []string) error { + // Execute the roachprod command with the provided context, logger, cluster name, and options. + return roachprod.Run( + context.Background(), l, clusterName, "", "", false, + os.Stdout, os.Stderr, cmdArray, install.DefaultRunOptions(), + ) +} + +// InitLogger initializes and returns a logger based on the provided log file path. +// If the logger configuration fails, the program prints an error and exits. +func InitLogger(path string) *logger.Logger { + loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr} // Create a logger config with standard output and error. + var loggerError error + l, loggerError := loggerCfg.NewLogger(path) // Create a new logger based on the configuration. + if loggerError != nil { + // If there is an error initializing the logger, print the error message and exit the program. + _, _ = fmt.Fprintf(os.Stderr, "unable to configure logger: %s\n", loggerError) + os.Exit(1) + } + return l // Return the initialized logger. +} diff --git a/pkg/cmd/roachprod-microbench/stage.go b/pkg/cmd/roachprod-microbench/stage.go index e56aaa441377..a0c3cd5cd745 100644 --- a/pkg/cmd/roachprod-microbench/stage.go +++ b/pkg/cmd/roachprod-microbench/stage.go @@ -11,7 +11,6 @@ import ( "path" "strings" - "github.com/cockroachdb/cockroach/pkg/cmd/roachprod-microbench/util" "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/errors" @@ -22,7 +21,7 @@ import ( func stage(cluster, archivePath, remoteDest string) (err error) { ctx := context.Background() - util.InitRoachprod() + InitRoachprod() loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr} l, err := loggerCfg.NewLogger("") if err != nil { @@ -34,22 +33,22 @@ func stage(cluster, archivePath, remoteDest string) (err error) { defer func() { // Remove the remote archive after we're done. - cleanUpErr := util.RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath}) + cleanUpErr := RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath}) err = errors.CombineErrors(err, errors.Wrapf(cleanUpErr, "removing remote archive: %s", archiveRemotePath)) }() // Remove the remote archive and destination directory if they exist. - if err = util.RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath}); err != nil { + if err = RoachprodRun(cluster, l, []string{"rm", "-rf", archiveRemotePath}); err != nil { return errors.Wrapf(err, "removing remote archive: %s", archiveRemotePath) } - if err = util.RoachprodRun(cluster, l, []string{"rm", "-rf", remoteDest}); err != nil { + if err = RoachprodRun(cluster, l, []string{"rm", "-rf", remoteDest}); err != nil { return errors.Wrapf(err, "removing remote destination: %s", remoteDest) } // Copy the archive to the remote machine. copyFromGCS := strings.HasPrefix(archivePath, "gs://") if copyFromGCS { - if err = util.RoachprodRun(cluster, l, []string{"gsutil", "-q", "-m", "cp", archivePath, archiveRemotePath}); err != nil { + if err = RoachprodRun(cluster, l, []string{"gsutil", "-q", "-m", "cp", archivePath, archiveRemotePath}); err != nil { return errors.Wrapf(err, "copying archive from GCS: %s", archivePath) } } else { @@ -59,10 +58,10 @@ func stage(cluster, archivePath, remoteDest string) (err error) { } // Extract the archive on the remote machine. - if err = util.RoachprodRun(cluster, l, []string{"mkdir", "-p", remoteDest}); err != nil { + if err = RoachprodRun(cluster, l, []string{"mkdir", "-p", remoteDest}); err != nil { return errors.Wrapf(err, "creating remote destination: %s", remoteDest) } - if err = util.RoachprodRun(cluster, l, []string{"tar", "-C", remoteDest, "-xzf", archiveRemotePath}); err != nil { + if err = RoachprodRun(cluster, l, []string{"tar", "-C", remoteDest, "-xzf", archiveRemotePath}); err != nil { return errors.Wrapf(err, "extracting archive: %s", archiveRemotePath) } diff --git a/pkg/cmd/roachprod-microbench/util/BUILD.bazel b/pkg/cmd/roachprod-microbench/util/BUILD.bazel index cf9b1129a9a0..319a7c5651ce 100644 --- a/pkg/cmd/roachprod-microbench/util/BUILD.bazel +++ b/pkg/cmd/roachprod-microbench/util/BUILD.bazel @@ -6,9 +6,6 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachprod-microbench/util", visibility = ["//visibility:public"], deps = [ - "//pkg/roachprod", - "//pkg/roachprod/install", - "//pkg/roachprod/logger", "@com_github_spf13_cobra//:cobra", "@org_golang_x_exp//maps", ], diff --git a/pkg/cmd/roachprod-microbench/util/util.go b/pkg/cmd/roachprod-microbench/util/util.go index 1667159fe6d0..6e3522fe323d 100644 --- a/pkg/cmd/roachprod-microbench/util/util.go +++ b/pkg/cmd/roachprod-microbench/util/util.go @@ -6,16 +6,12 @@ package util import ( - "context" "fmt" "os" "regexp" "sort" "strings" - "github.com/cockroachdb/cockroach/pkg/roachprod" - "github.com/cockroachdb/cockroach/pkg/roachprod/install" - "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/spf13/cobra" "golang.org/x/exp/maps" ) @@ -25,13 +21,15 @@ const TimeFormat = "2006-01-02T15_04_05" const PackageSeparator = "→" var ( - // invalidCharRegex matches + // invalidCharKeyRegex matches // the first character if it is not a letter (a-z, A-Z) or an underscore (_) // or // any character that is not a letter (a-z, A-Z), digit (0-9), or underscore (_). - invalidCharKeyRegex = regexp.MustCompile(`(^[^a-zA-Z_])|([^a-zA-Z0-9_])`) - // invalidCharValueRegex + invalidCharKeyRegex = regexp.MustCompile(`(^[^a-zA-Z_])|([^a-zA-Z0-9_])`) invalidCharValueRegex = regexp.MustCompile(`[\\\n"]`) + + // invalidMetricNameRegex matches any other character other than _, :, characters and digits + invalidMetricNameRegex = regexp.MustCompile(`[^a-zA-Z0-9_:]`) ) // LabelMapToString converts a map of labels (key-value pairs) into a formatted string. @@ -55,6 +53,11 @@ func SanitizeKey(input string) string { return invalidCharKeyRegex.ReplaceAllString(input, "_") } +// SanitizeMetricName replaces all invalid characters in input string with underscores +func SanitizeMetricName(input string) string { + return invalidMetricNameRegex.ReplaceAllString(input, "_") +} + // SanitizeValue replaces all \,\n and " with underscores (_). func SanitizeValue(input string) string { // Replace all characters that match as per the regex with an underscore. @@ -120,33 +123,3 @@ func GetRegexExclusionPairs(excludeList []string) [][]*regexp.Regexp { } return excludeRegexes } - -// InitRoachprod initializes the roachprod providers by calling InitProviders. -// This function sets up the environment for running roachprod commands. -func InitRoachprod() { - _ = roachprod.InitProviders() -} - -// RoachprodRun runs a command on a roachprod cluster with the given cluster name and logger. -// It takes a list of command arguments and passes them to the roachprod command execution. -func RoachprodRun(clusterName string, l *logger.Logger, cmdArray []string) error { - // Execute the roachprod command with the provided context, logger, cluster name, and options. - return roachprod.Run( - context.Background(), l, clusterName, "", "", false, - os.Stdout, os.Stderr, cmdArray, install.DefaultRunOptions(), - ) -} - -// InitLogger initializes and returns a logger based on the provided log file path. -// If the logger configuration fails, the program prints an error and exits. -func InitLogger(path string) *logger.Logger { - loggerCfg := logger.Config{Stdout: os.Stdout, Stderr: os.Stderr} // Create a logger config with standard output and error. - var loggerError error - l, loggerError := loggerCfg.NewLogger(path) // Create a new logger based on the configuration. - if loggerError != nil { - // If there is an error initializing the logger, print the error message and exit the program. - _, _ = fmt.Fprintf(os.Stderr, "unable to configure logger: %s\n", loggerError) - os.Exit(1) - } - return l // Return the initialized logger. -} diff --git a/pkg/cmd/roachtest/clusterstats/BUILD.bazel b/pkg/cmd/roachtest/clusterstats/BUILD.bazel index 73be50eb46d8..b373a1ffe23a 100644 --- a/pkg/cmd/roachtest/clusterstats/BUILD.bazel +++ b/pkg/cmd/roachtest/clusterstats/BUILD.bazel @@ -36,6 +36,7 @@ go_library( "@com_github_prometheus_client_golang//api", "@com_github_prometheus_client_golang//api/prometheus/v1:prometheus", "@com_github_prometheus_common//model", + "@org_golang_x_exp//maps", ], ) diff --git a/pkg/cmd/roachtest/clusterstats/exporter.go b/pkg/cmd/roachtest/clusterstats/exporter.go index 4a8f5fcc95b4..25a029ff9062 100644 --- a/pkg/cmd/roachtest/clusterstats/exporter.go +++ b/pkg/cmd/roachtest/clusterstats/exporter.go @@ -11,6 +11,7 @@ import ( "encoding/json" "fmt" "path/filepath" + "strings" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachprod-microbench/util" @@ -21,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" + "golang.org/x/exp/maps" ) // ClusterStat represents a filtered query by the given LabelName. For example, @@ -128,7 +130,7 @@ func (r *ClusterStatRun) serializeOpenmetricsOutRun( ctx context.Context, t test.Test, c cluster.Cluster, ) error { - labelString := GetDefaultOpenmetricsLabelString(t, c) + labelString := GetOpenmetricsLabelString(t, c, nil) report, err := serializeOpenmetricsReport(*r, &labelString) if err != nil { return errors.Wrap(err, "failed to serialize perf artifacts") @@ -142,13 +144,13 @@ func serializeOpenmetricsReport(r ClusterStatRun, labelString *string) (*bytes.B // Emit summary metrics from Total for key, value := range r.Total { - buffer.WriteString(fmt.Sprintf("# TYPE %s gauge\n", util.SanitizeKey(key))) + buffer.WriteString(fmt.Sprintf("# TYPE %s gauge\n", util.SanitizeMetricName(key))) buffer.WriteString(fmt.Sprintf("%s{%s} %f %d\n", util.SanitizeKey(key), *labelString, value, timeutil.Now().UTC().Unix())) } // Emit histogram metrics from Stats for _, stat := range r.Stats { - buffer.WriteString(fmt.Sprintf("# TYPE %s gauge\n", util.SanitizeKey(stat.Tag))) + buffer.WriteString(fmt.Sprintf("# TYPE %s gauge\n", util.SanitizeMetricName(stat.Tag))) for i, timestamp := range stat.Time { t := timeutil.Unix(0, timestamp) buffer.WriteString( @@ -397,15 +399,49 @@ func (cs *clusterStatCollector) getStatSummary( return ret, nil } -func GetDefaultOpenmetricsLabelString(t test.Test, c cluster.Cluster) string { - return util.LabelMapToString(GetDefaultOpenmetricsLabelMap(t, c)) +// GetOpenmetricsLabelString creates a string that follows the openmetrics labels format +func GetOpenmetricsLabelString(t test.Test, c cluster.Cluster, labels map[string]string) string { + return util.LabelMapToString(GetOpenmetricsLabelMap(t, c, labels)) } -func GetDefaultOpenmetricsLabelMap(t test.Test, c cluster.Cluster) map[string]string { - return map[string]string{ - "test": t.Name(), +// GetOpenmetricsLabelMap creates a map of label keys and values +// It takes roachtest parameters and create relevant labels. +// Test name is split and each split is added as a subtype +func GetOpenmetricsLabelMap( + t test.Test, c cluster.Cluster, labels map[string]string, +) map[string]string { + defaultMap := map[string]string{ "cloud": c.Cloud().String(), "owner": string(t.Spec().(*registry.TestSpec).Owner), "suite": t.Spec().(*registry.TestSpec).Suites.String(), } + + // Since the roachtest have / delimiter for subtests + // Partitioning the name by '/' + testNameArray := strings.Split(t.Name(), "/") + defaultMap["test"] = testNameArray[0] + + subTestIterator := 1 + for i := 1; i < len(testNameArray); i++ { + testSubName := testNameArray[i] + + // If the partition has '=', add the key as a label itself + // and the value as its value + if strings.Contains(testSubName, "=") { + testLabels := strings.Split(testSubName, "=") + defaultMap[testLabels[0]] = testLabels[1] + } else { + + // Add the subtest label with the iterator since there can be nested subtests + testSubType := fmt.Sprintf("subtest-%d", subTestIterator) + subTestIterator++ + defaultMap[testSubType] = testSubName + } + } + + // If the tests has passed some custom labels, copy them to the map created above + if labels != nil { + maps.Copy(defaultMap, labels) + } + return defaultMap } diff --git a/pkg/cmd/roachtest/roachtestutil/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel index 18afad4db7a2..494eea23c4c2 100644 --- a/pkg/cmd/roachtest/roachtestutil/BUILD.bazel +++ b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel @@ -19,6 +19,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/cmd/roachtest/cluster", + "//pkg/cmd/roachtest/clusterstats", "//pkg/cmd/roachtest/option", "//pkg/cmd/roachtest/spec", "//pkg/cmd/roachtest/test", @@ -36,6 +37,7 @@ go_library( "//pkg/util/retry", "//pkg/util/syncutil", "//pkg/util/timeutil", + "//pkg/workload/histogram/exporter", "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//require", ], diff --git a/pkg/cmd/roachtest/roachtestutil/utils.go b/pkg/cmd/roachtest/roachtestutil/utils.go index 0cdf7bfb4f12..f2fd49fa3719 100644 --- a/pkg/cmd/roachtest/roachtestutil/utils.go +++ b/pkg/cmd/roachtest/roachtestutil/utils.go @@ -6,21 +6,25 @@ package roachtestutil import ( + "bytes" "context" "fmt" "io" "net/http" "os" + "path/filepath" "regexp" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/config" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/cockroach/pkg/workload/histogram/exporter" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -65,6 +69,72 @@ func (e *EveryN) ShouldLog() bool { return e.ShouldProcess(timeutil.Now()) } +// GetWorkloadHistogramArgs creates a histogram flag string based on the roachtest to pass to workload binary +// This is used to make use of t.ExportOpenmetrics() method and create appropriate exporter +func GetWorkloadHistogramArgs(t test.Test, c cluster.Cluster, labels map[string]string) string { + var histogramArgs string + if t.ExportOpenmetrics() { + // Add openmetrics related labels and arguments + histogramArgs = fmt.Sprintf(" --histogram-export-format='openmetrics' --histograms=%s/%s --openmetrics-labels='%s'", + t.PerfArtifactsDir(), GetBenchmarkMetricsFileName(t), clusterstats.GetOpenmetricsLabelString(t, c, labels)) + } else { + // Since default is json, no need to add --histogram-export-format flag in this case and also the labels + histogramArgs = fmt.Sprintf(" --histograms=%s/%s", GetBenchmarkMetricsFileName(t), t.PerfArtifactsDir()) + } + + return histogramArgs +} + +// GetBenchmarkMetricsFileName returns the file name to store the benchmark output +func GetBenchmarkMetricsFileName(t test.Test) string { + if t.ExportOpenmetrics() { + return "stats.om" + } + + return "stats.json" +} + +// CreateWorkloadHistogramExporter creates a exporter.Exporter based on the roachtest parameters +func CreateWorkloadHistogramExporter(t test.Test, c cluster.Cluster) exporter.Exporter { + var metricsExporter exporter.Exporter + if t.ExportOpenmetrics() { + labels := clusterstats.GetOpenmetricsLabelMap(t, c, nil) + openMetricsExporter := &exporter.OpenMetricsExporter{} + openMetricsExporter.SetLabels(&labels) + metricsExporter = openMetricsExporter + + } else { + metricsExporter = &exporter.HdrJsonExporter{} + } + + return metricsExporter +} + +func CreateStatsFileInClusterFromExporter( + ctx context.Context, + t test.Test, + c cluster.Cluster, + perfBuf *bytes.Buffer, + exporter exporter.Exporter, + node option.NodeListOption, +) (string, error) { + if err := exporter.Close(nil); err != nil { + return "", err + } + destinationFileName := GetBenchmarkMetricsFileName(t) + // Upload the perf artifacts to any one of the nodes so that the test + // runner copies it into an appropriate directory path. + dest := filepath.Join(t.PerfArtifactsDir(), destinationFileName) + if err := c.RunE(ctx, option.WithNodes(node), "mkdir -p "+filepath.Dir(dest)); err != nil { + t.L().ErrorfCtx(ctx, "failed to create perf dir: %+v", err) + } + if err := c.PutString(ctx, perfBuf.String(), dest, 0755, node); err != nil { + t.L().ErrorfCtx(ctx, "failed to upload perf artifacts to node: %s", err.Error()) + } + + return destinationFileName, nil +} + // WaitForReady waits until the given nodes report ready via health checks. // This implies that the node has completed server startup, is heartbeating its // liveness record, and can serve SQL clients. diff --git a/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go b/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go index b1078091ee93..7ffe59c14eec 100644 --- a/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_disk_bandwidth_overload.go @@ -98,10 +98,16 @@ func registerDiskBandwidthOverload(r registry.Registry) { m.Go(func(ctx context.Context) error { t.Status(fmt.Sprintf("starting foreground kv workload thread (<%s)", time.Minute)) dur := " --duration=" + duration.String() + labels := map[string]string{ + "concurrency": "2", + "splits": "1000", + "read-percent": "50", + } url := fmt.Sprintf(" {pgurl%s}", c.CRDBNodes()) - cmd := "./cockroach workload run kv --histograms=perf/stats.json --concurrency=2 " + - "--splits=1000 --read-percent=50 --min-block-bytes=1024 --max-block-bytes=1024 " + - "--txn-qos='regular' --tolerate-errors" + foregroundDB + dur + url + cmd := fmt.Sprintf("./cockroach workload run kv %s--concurrency=2 "+ + "--splits=1000 --read-percent=50 --min-block-bytes=1024 --max-block-bytes=1024 "+ + "--txn-qos='regular' --tolerate-errors %s %s %s", + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), foregroundDB, dur, url) c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd) return nil }) @@ -111,9 +117,13 @@ func registerDiskBandwidthOverload(r registry.Registry) { t.Status(fmt.Sprintf("starting background kv workload thread (<%s)", time.Minute)) dur := " --duration=" + duration.String() url := fmt.Sprintf(" {pgurl%s}", c.CRDBNodes()) - cmd := "./cockroach workload run kv --histograms=perf/stats.json --concurrency=1024 " + - "--read-percent=0 --min-block-bytes=4096 --max-block-bytes=4096 " + - "--txn-qos='background' --tolerate-errors" + backgroundDB + dur + url + labels := map[string]string{ + "concurrency": "1024", + "read-percent": "0", + } + cmd := fmt.Sprintf("./cockroach workload run kv %s --concurrency=1024 "+ + "--read-percent=0 --min-block-bytes=4096 --max-block-bytes=4096 "+ + "--txn-qos='background' --tolerate-errors %s %s %s", roachtestutil.GetWorkloadHistogramArgs(t, c, labels), backgroundDB, dur, url) c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd) return nil }) diff --git a/pkg/cmd/roachtest/tests/admission_control_elastic_io.go b/pkg/cmd/roachtest/tests/admission_control_elastic_io.go index fbf441c319ac..f66aee2e0935 100644 --- a/pkg/cmd/roachtest/tests/admission_control_elastic_io.go +++ b/pkg/cmd/roachtest/tests/admission_control_elastic_io.go @@ -71,12 +71,17 @@ func registerElasticIO(r registry.Registry) { duration := 30 * time.Minute t.Status("running workload") m := c.NewMonitor(ctx, c.CRDBNodes()) + labels := map[string]string{ + "duration": fmt.Sprintf("%d", duration.Milliseconds()), + "concurrency": "512", + } m.Go(func(ctx context.Context) error { dur := " --duration=" + duration.String() url := fmt.Sprintf(" {pgurl%s}", c.CRDBNodes()) - cmd := "./cockroach workload run kv --init --histograms=perf/stats.json --concurrency=512 " + - "--splits=1000 --read-percent=0 --min-block-bytes=65536 --max-block-bytes=65536 " + - "--txn-qos=background --tolerate-errors --secure" + dur + url + cmd := fmt.Sprintf("./cockroach workload run kv --init %s -concurrency=512 "+ + "--splits=1000 --read-percent=0 --min-block-bytes=65536 --max-block-bytes=65536 "+ + "--txn-qos=background --tolerate-errors --secure %s %s", + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), dur, url) c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd) return nil }) diff --git a/pkg/cmd/roachtest/tests/admission_control_elastic_mixed_version.go b/pkg/cmd/roachtest/tests/admission_control_elastic_mixed_version.go index 975251bafffe..1dec1fa84742 100644 --- a/pkg/cmd/roachtest/tests/admission_control_elastic_mixed_version.go +++ b/pkg/cmd/roachtest/tests/admission_control_elastic_mixed_version.go @@ -81,25 +81,31 @@ func registerElasticWorkloadMixedVersion(r registry.Registry) { "--min-block-bytes=512 --max-block-bytes=1024 {pgurl%s}", binary, c.Node(1))) } + labels := map[string]string{ + "concurrency": "500", + "read-percent": "5", + } // The workloads are tuned to keep the cluster busy at 30-40% CPU, and IO // overload metric approaching 20-30% which causes elastic traffic being // de-prioritized and wait. runForeground := func(ctx context.Context, duration time.Duration) error { cmd := roachtestutil.NewCommand("./cockroach workload run kv "+ - "--histograms=perf/stats.json --concurrency=500 "+ + "%s --concurrency=500 "+ "--max-rate=5000 --read-percent=5 "+ "--min-block-bytes=512 --max-block-bytes=1024 "+ "--txn-qos='regular' "+ - "--duration=%v {pgurl%s}", duration, c.CRDBNodes()) + "--duration=%v {pgurl%s}", roachtestutil.GetWorkloadHistogramArgs(t, c, labels), duration, c.CRDBNodes()) return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd.String()) } + + labels["read-percent"] = "0" runBackground := func(ctx context.Context, duration time.Duration) error { cmd := roachtestutil.NewCommand("./cockroach workload run kv "+ - "--histograms=perf/stats.json --concurrency=500 "+ + "%s --concurrency=500 "+ "--max-rate=10000 --read-percent=0 "+ "--min-block-bytes=2048 --max-block-bytes=4096 "+ "--txn-qos='background' "+ - "--duration=%v {pgurl%s}", duration, c.CRDBNodes()) + "--duration=%v {pgurl%s}", roachtestutil.GetWorkloadHistogramArgs(t, c, labels), duration, c.CRDBNodes()) return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd.String()) } runWorkloads := func(ctx2 context.Context) error { diff --git a/pkg/cmd/roachtest/tests/admission_control_multi_store_overload.go b/pkg/cmd/roachtest/tests/admission_control_multi_store_overload.go index 80c7d382fb93..cf2483e99e03 100644 --- a/pkg/cmd/roachtest/tests/admission_control_multi_store_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_multi_store_overload.go @@ -54,7 +54,10 @@ func registerMultiStoreOverload(r registry.Registry) { t.Status("running workload") dur := 20 * time.Minute duration := " --duration=" + roachtestutil.IfLocal(c, "10s", dur.String()) - histograms := " --histograms=" + t.PerfArtifactsDir() + "/stats.json" + labels := map[string]string{ + "duration": dur.String(), + } + histograms := " " + roachtestutil.GetWorkloadHistogramArgs(t, c, labels) m1 := c.NewMonitor(ctx, c.CRDBNodes()) m1.Go(func(ctx context.Context) error { dbRegular := " --db=db1" diff --git a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload.go b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload.go index 768cabe9d544..7260d20bd6b6 100644 --- a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload.go @@ -142,10 +142,16 @@ func registerSnapshotOverload(r registry.Registry) { m := c.NewMonitor(ctx, c.CRDBNodes()) m.Go(func(ctx context.Context) error { duration := " --duration=" + totalWorkloadDuration.String() - histograms := " --histograms=" + t.PerfArtifactsDir() + "/stats.json" concurrency := roachtestutil.IfLocal(c, " --concurrency=8", " --concurrency=256") maxRate := roachtestutil.IfLocal(c, " --max-rate=100", " --max-rate=12000") splits := roachtestutil.IfLocal(c, " --splits=10", " --splits=100") + + labels := map[string]string{ + "concurrency": concurrency, + "max-rate": maxRate, + "splits": splits, + } + histograms := roachtestutil.GetWorkloadHistogramArgs(t, c, labels) c.Run(ctx, option.WithNodes(c.WorkloadNode()), "./cockroach workload run kv --max-block-bytes=1 --read-percent=95 "+ histograms+duration+concurrency+maxRate+splits+fmt.Sprintf(" {pgurl%s}", c.CRDBNodes()), diff --git a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go index 658a2dea174d..9f664be609d4 100644 --- a/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go +++ b/pkg/cmd/roachtest/tests/admission_control_snapshot_overload_io.go @@ -8,6 +8,7 @@ package tests import ( "context" "fmt" + "strconv" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" @@ -192,12 +193,17 @@ func runAdmissionControlSnapshotOverloadIO( t.Status(fmt.Sprintf("starting kv workload thread (<%s)", time.Minute)) m := c.NewMonitor(ctx, c.CRDBNodes()) m.Go(func(ctx context.Context) error { + + labels := map[string]string{ + "concurrency": "4000", + "read-percent": strconv.Itoa(cfg.readPercent), + } c.Run(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf("./cockroach workload run kv --tolerate-errors "+ - "--splits=1000 --histograms=%s/stats.json --read-percent=%d "+ + "--splits=1000 %s --read-percent=%d "+ "--max-rate=600 --max-block-bytes=%d --min-block-bytes=%d "+ "--concurrency=4000 --duration=%s {pgurl:1-2}", - t.PerfArtifactsDir(), + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), cfg.readPercent, cfg.workloadBlockBytes, cfg.workloadBlockBytes, diff --git a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go index 8320f3b0eeca..ab80572aa07f 100644 --- a/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go +++ b/pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go @@ -8,12 +8,14 @@ package tests import ( "context" "fmt" + "strconv" "strings" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -57,16 +59,18 @@ func (s tpccOLAPSpec) run(ctx context.Context, t test.Test, c cluster.Cluster) { m := c.NewMonitor(ctx, c.CRDBNodes()) rampDuration := 2 * time.Minute duration := 3 * time.Minute + labels := getTpccLabels(s.Warehouses, rampDuration, duration, map[string]string{"concurrency": strconv.Itoa(s.Concurrency)}) m.Go(func(ctx context.Context) error { t.WorkerStatus("running querybench") cmd := fmt.Sprintf( "./workload run querybench --db tpcc"+ " --tolerate-errors=t"+ " --concurrency=%d"+ - " --query-file %s"+ - " --histograms="+t.PerfArtifactsDir()+"/stats.json "+ + " --query-file %s "+ + " %s"+ " --ramp=%s --duration=%s {pgurl:1-%d}", - s.Concurrency, queryFileName, rampDuration, duration, c.Spec().NodeCount-1) + s.Concurrency, queryFileName, roachtestutil.GetWorkloadHistogramArgs(t, c, labels), + rampDuration, duration, c.Spec().NodeCount-1) c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd) return nil }) diff --git a/pkg/cmd/roachtest/tests/backup.go b/pkg/cmd/roachtest/tests/backup.go index e2176dc589ad..7abe08189218 100644 --- a/pkg/cmd/roachtest/tests/backup.go +++ b/pkg/cmd/roachtest/tests/backup.go @@ -10,12 +10,11 @@ import ( "context" gosql "database/sql" "encoding/base64" - "encoding/json" "fmt" + "io" "net" "net/url" "os" - "path/filepath" "strings" "time" @@ -39,6 +38,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/workload/histogram" + "github.com/cockroachdb/cockroach/pkg/workload/histogram/exporter" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -286,21 +286,28 @@ func fingerprint(ctx context.Context, conn *gosql.DB, db, table string) (string, // initBulkJobPerfArtifacts registers a histogram, creates a performance // artifact directory and returns a method that when invoked records a tick. -func initBulkJobPerfArtifacts(testName string, timeout time.Duration) (func(), *bytes.Buffer) { +func initBulkJobPerfArtifacts( + timeout time.Duration, t test.Test, e exporter.Exporter, +) (func(), *bytes.Buffer) { // Register a named histogram to track the total time the bulk job took. // Roachperf uses this information to display information about this // roachtest. - reg := histogram.NewRegistry( + + reg := histogram.NewRegistryWithExporter( timeout, histogram.MockWorkloadName, + e, ) - reg.GetHandle().Get(testName) + reg.GetHandle().Get(t.Name()) bytesBuf := bytes.NewBuffer([]byte{}) - jsonEnc := json.NewEncoder(bytesBuf) + writer := io.Writer(bytesBuf) + + e.Init(&writer) + tick := func() { reg.Tick(func(tick histogram.Tick) { - _ = jsonEnc.Encode(tick.Snapshot()) + _ = tick.Exporter.SnapshotAndWrite(tick.Hist, tick.Now, tick.Elapsed, &tick.Name) }) } @@ -328,7 +335,8 @@ func registerBackup(r registry.Registry) { rows = 100 } dest := importBankData(ctx, rows, t, c) - tick, perfBuf := initBulkJobPerfArtifacts("backup/2TB", 2*time.Hour) + exporter := roachtestutil.CreateWorkloadHistogramExporter(t, c) + tick, perfBuf := initBulkJobPerfArtifacts(2*time.Hour, t, exporter) m := c.NewMonitor(ctx) m.Go(func(ctx context.Context) error { @@ -349,14 +357,8 @@ func registerBackup(r registry.Registry) { } tick() - // Upload the perf artifacts to any one of the nodes so that the test - // runner copies it into an appropriate directory path. - dest := filepath.Join(t.PerfArtifactsDir(), "stats.json") - if err := c.RunE(ctx, option.WithNodes(c.Node(1)), "mkdir -p "+filepath.Dir(dest)); err != nil { - t.L().ErrorfCtx(ctx, "failed to create perf dir: %+v", err) - } - if err := c.PutString(ctx, perfBuf.String(), dest, 0755, c.Node(1)); err != nil { - t.L().ErrorfCtx(ctx, "failed to upload perf artifacts to node: %s", err.Error()) + if _, err := roachtestutil.CreateStatsFileInClusterFromExporter(ctx, t, c, perfBuf, exporter, c.Node(1)); err != nil { + return err } return nil }) diff --git a/pkg/cmd/roachtest/tests/cdc_bench.go b/pkg/cmd/roachtest/tests/cdc_bench.go index 5246267d0901..b050b685a72c 100644 --- a/pkg/cmd/roachtest/tests/cdc_bench.go +++ b/pkg/cmd/roachtest/tests/cdc_bench.go @@ -9,9 +9,8 @@ import ( "bytes" "context" gosql "database/sql" - "encoding/json" "fmt" - "path/filepath" + "io" "strconv" "sync/atomic" "time" @@ -346,7 +345,7 @@ func runCDCBenchScan( t.L().Printf("changefeed completed in %s (scanned %s rows per second)", duration.Truncate(time.Second), humanize.Comma(rate)) - // Record scan rate to stats.json. + // Record scan rate to stats file. return writeCDCBenchStats(ctx, t, c, nCoord, "scan-rate", rate) }) @@ -547,10 +546,17 @@ func runCDCBenchWorkload( extra += ` --tolerate-errors` } t.L().Printf("running workload") + labels := map[string]string{ + "duration": duration.String(), + "concurrency": fmt.Sprintf("%d", concurrency), + "read_percent": fmt.Sprintf("%d", readPercent), + "insert_count": fmt.Sprintf("%d", insertCount), + } + err := c.RunE(ctx, option.WithNodes(nWorkload), fmt.Sprintf( - `./cockroach workload run kv --seed %d --histograms=%s/stats.json `+ + `./cockroach workload run kv --seed %d %s `+ `--concurrency %d --duration %s --write-seq R%d --read-percent %d %s {pgurl:%d-%d}`, - workloadSeed, t.PerfArtifactsDir(), concurrency, duration, insertCount, readPercent, extra, + workloadSeed, roachtestutil.GetWorkloadHistogramArgs(t, c, labels), concurrency, duration, insertCount, readPercent, extra, nData[0], nData[len(nData)-1])) if err != nil { return err @@ -627,7 +633,7 @@ func waitForChangefeed( } } -// writeCDCBenchStats writes a single perf metric into stats.json on the +// writeCDCBenchStats writes a single perf metric into stats file on the // given node, for graphing in roachperf. func writeCDCBenchStats( ctx context.Context, @@ -640,25 +646,24 @@ func writeCDCBenchStats( // The easiest way to record a precise metric for roachperf is to cast it as a // duration in seconds in the histogram's upper bound. valueS := time.Duration(value) * time.Second - reg := histogram.NewRegistry(valueS, histogram.MockWorkloadName) + + exporter := roachtestutil.CreateWorkloadHistogramExporter(t, c) + reg := histogram.NewRegistryWithExporter(valueS, histogram.MockWorkloadName, exporter) + bytesBuf := bytes.NewBuffer([]byte{}) - jsonEnc := json.NewEncoder(bytesBuf) + writer := io.Writer(bytesBuf) + exporter.Init(&writer) var err error reg.GetHandle().Get(metric).Record(valueS) reg.Tick(func(tick histogram.Tick) { - err = jsonEnc.Encode(tick.Snapshot()) + err = tick.Exporter.SnapshotAndWrite(tick.Hist, tick.Now, tick.Elapsed, &tick.Name) }) if err != nil { return err } - // Upload the perf artifacts to the given node. - path := filepath.Join(t.PerfArtifactsDir(), "stats.json") - if err := c.RunE(ctx, option.WithNodes(node), "mkdir -p "+filepath.Dir(path)); err != nil { - return err - } - if err := c.PutString(ctx, bytesBuf.String(), path, 0755, node); err != nil { + if _, err = roachtestutil.CreateStatsFileInClusterFromExporter(ctx, t, c, bytesBuf, exporter, node); err != nil { return err } return nil diff --git a/pkg/cmd/roachtest/tests/connection_latency.go b/pkg/cmd/roachtest/tests/connection_latency.go index 11395c0dc674..99cb84a3b4a0 100644 --- a/pkg/cmd/roachtest/tests/connection_latency.go +++ b/pkg/cmd/roachtest/tests/connection_latency.go @@ -71,10 +71,15 @@ func runConnectionLatencyTest( t.L().Printf("running workload in %q against urls:\n%s", locality, strings.Join(urls, "\n")) + labels := map[string]string{ + "duration": "30000", + "locality": locality, + } + workloadCmd := fmt.Sprintf( - `./workload run connectionlatency %s --secure --duration 30s --histograms=%s/stats.json --locality %s`, + `./workload run connectionlatency %s --secure --duration 30s %s --locality %s`, urlString, - t.PerfArtifactsDir(), + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), locality, ) err = c.RunE(ctx, option.WithNodes(loadNode), workloadCmd) diff --git a/pkg/cmd/roachtest/tests/decommissionbench.go b/pkg/cmd/roachtest/tests/decommissionbench.go index f3fc6da6f90f..d26b4a7c26b9 100644 --- a/pkg/cmd/roachtest/tests/decommissionbench.go +++ b/pkg/cmd/roachtest/tests/decommissionbench.go @@ -9,8 +9,8 @@ import ( "bytes" "context" gosql "database/sql" - "encoding/json" "fmt" + "io" "math" "os" "path/filepath" @@ -30,6 +30,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/workload/histogram" + "github.com/cockroachdb/cockroach/pkg/workload/histogram/exporter" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -335,18 +336,28 @@ func fireAfter(ctx context.Context, duration time.Duration, fn func()) { // rather than utilizing the values recorded in the histogram, and can be // recorded in the perfBuf by utilizing the returned tickByName(name) function. func createDecommissionBenchPerfArtifacts( - opNames ...string, -) (reg *histogram.Registry, tickByName func(name string), perfBuf *bytes.Buffer) { + t test.Test, c cluster.Cluster, opNames ...string, +) ( + reg *histogram.Registry, + tickByName func(name string), + perfBuf *bytes.Buffer, + exporter exporter.Exporter, +) { + + exporter = roachtestutil.CreateWorkloadHistogramExporter(t, c) + // Create a histogram registry for recording multiple decommission metrics, // following the "bulk job" form of measuring performance. // See runDecommissionBench for more explanation. - reg = histogram.NewRegistry( + reg = histogram.NewRegistryWithExporter( defaultTimeout, histogram.MockWorkloadName, + exporter, ) perfBuf = bytes.NewBuffer([]byte{}) - jsonEnc := json.NewEncoder(perfBuf) + writer := io.Writer(perfBuf) + exporter.Init(&writer) registeredOpNames := make(map[string]struct{}) for _, opName := range opNames { @@ -357,12 +368,12 @@ func createDecommissionBenchPerfArtifacts( tickByName = func(name string) { reg.Tick(func(tick histogram.Tick) { if _, ok := registeredOpNames[name]; ok && tick.Name == name { - _ = jsonEnc.Encode(tick.Snapshot()) + _ = tick.Exporter.SnapshotAndWrite(tick.Hist, tick.Now, tick.Elapsed, &tick.Name) } }) } - return reg, tickByName, perfBuf + return reg, tickByName, perfBuf, exporter } // setupDecommissionBench performs the initial cluster setup needed prior to @@ -488,23 +499,24 @@ func uploadPerfArtifacts( benchSpec decommissionBenchSpec, pinnedNode, workloadNode int, perfBuf *bytes.Buffer, + exporter exporter.Exporter, ) { // Store the perf artifacts on the pinned node so that the test // runner copies it into an appropriate directory path. - dest := filepath.Join(t.PerfArtifactsDir(), "stats.json") - if err := c.RunE(ctx, option.WithNodes(c.Node(pinnedNode)), "mkdir -p "+filepath.Dir(dest)); err != nil { - t.L().Errorf("failed to create perf dir: %+v", err) - } - if err := c.PutString(ctx, perfBuf.String(), dest, 0755, c.Node(pinnedNode)); err != nil { - t.L().Errorf("failed to upload perf artifacts to node: %s", err.Error()) + + destFileName, err := roachtestutil.CreateStatsFileInClusterFromExporter(ctx, t, c, perfBuf, exporter, c.Node(pinnedNode)) + if err != nil { + t.L().Errorf("error creating perf stats file: %s", err) + return } + dest := filepath.Join(t.PerfArtifactsDir(), destFileName) // Get the workload perf artifacts and move them to the pinned node, so that // they can be used to display the workload operation rates during decommission. if !benchSpec.noLoad { - workloadStatsSrc := filepath.Join(t.PerfArtifactsDir(), "stats.json") - localWorkloadStatsPath := filepath.Join(t.ArtifactsDir(), "workload_stats.json") - workloadStatsDest := filepath.Join(t.PerfArtifactsDir(), "workload_stats.json") + workloadStatsSrc := filepath.Join(t.PerfArtifactsDir(), destFileName) + localWorkloadStatsPath := filepath.Join(t.ArtifactsDir(), "workload_"+destFileName) + workloadStatsDest := filepath.Join(t.PerfArtifactsDir(), "workload_"+destFileName) if err := c.Get( ctx, t.L(), workloadStatsSrc, localWorkloadStatsPath, c.Node(workloadNode), ); err != nil { @@ -608,8 +620,8 @@ func runDecommissionBench( benchSpec.warehouses, ) workloadCmd := fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --max-rate=%d --duration=%s "+ - "--histograms=%s/stats.json --ramp=%s --tolerate-errors {pgurl:1-%d}", maxRate, benchSpec.warehouses, - testTimeout, t.PerfArtifactsDir(), rampDuration, benchSpec.nodes) + "%s --ramp=%s --tolerate-errors {pgurl:1-%d}", maxRate, benchSpec.warehouses, + testTimeout, roachtestutil.GetWorkloadHistogramArgs(t, c, nil), rampDuration, benchSpec.nodes) // In the case that we want to simulate high read amplification, we use kv0 // to run a write-heavy workload known to be difficult for compactions to keep @@ -617,8 +629,8 @@ func runDecommissionBench( if benchSpec.slowWrites { workloadCmd = fmt.Sprintf("./cockroach workload run kv --init --concurrency=%d --splits=1000 "+ "--read-percent=50 --min-block-bytes=8192 --max-block-bytes=8192 --duration=%s "+ - "--histograms=%s/stats.json --ramp=%s --tolerate-errors {pgurl:1-%d}", benchSpec.nodes*64, - testTimeout, t.PerfArtifactsDir(), rampDuration, benchSpec.nodes) + "%s --ramp=%s --tolerate-errors {pgurl:1-%d}", benchSpec.nodes*64, + testTimeout, roachtestutil.GetWorkloadHistogramArgs(t, c, nil), rampDuration, benchSpec.nodes) } setupDecommissionBench(ctx, t, c, benchSpec, pinnedNode, importCmd) @@ -656,7 +668,8 @@ func runDecommissionBench( // long-running metrics measured by the elapsed time between each tick, // as opposed to the histograms of workload operation latencies or other // recorded values that are typically output in a "tick" each second. - reg, tickByName, perfBuf := createDecommissionBenchPerfArtifacts( + reg, tickByName, perfBuf, exporter := createDecommissionBenchPerfArtifacts( + t, c, decommissionMetric, estimatedMetric, bytesUsedMetric, @@ -722,7 +735,7 @@ func runDecommissionBench( t.Fatal(err) } - uploadPerfArtifacts(ctx, t, c, benchSpec, pinnedNode, workloadNode, perfBuf) + uploadPerfArtifacts(ctx, t, c, benchSpec, pinnedNode, workloadNode, perfBuf, exporter) } // runDecommissionBenchLong initializes a cluster with TPCC and attempts to @@ -753,8 +766,8 @@ func runDecommissionBenchLong( benchSpec.warehouses, ) workloadCmd := fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --max-rate=%d --duration=%s "+ - "--histograms=%s/stats.json --ramp=%s --tolerate-errors {pgurl:1-%d}", maxRate, benchSpec.warehouses, - testTimeout, t.PerfArtifactsDir(), rampDuration, benchSpec.nodes) + "%s --ramp=%s --tolerate-errors {pgurl:1-%d}", maxRate, benchSpec.warehouses, + testTimeout, roachtestutil.GetWorkloadHistogramArgs(t, c, nil), rampDuration, benchSpec.nodes) setupDecommissionBench(ctx, t, c, benchSpec, pinnedNode, importCmd) @@ -790,7 +803,7 @@ func runDecommissionBenchLong( // long-running metrics measured by the elapsed time between each tick, // as opposed to the histograms of workload operation latencies or other // recorded values that are typically output in a "tick" each second. - reg, tickByName, perfBuf := createDecommissionBenchPerfArtifacts( + reg, tickByName, perfBuf, exporter := createDecommissionBenchPerfArtifacts(t, c, decommissionMetric, upreplicateMetric, bytesUsedMetric, ) @@ -848,7 +861,7 @@ func runDecommissionBenchLong( t.Fatal(err) } - uploadPerfArtifacts(ctx, t, c, benchSpec, pinnedNode, workloadNode, perfBuf) + uploadPerfArtifacts(ctx, t, c, benchSpec, pinnedNode, workloadNode, perfBuf, exporter) } // runSingleDecommission picks a random node and attempts to decommission that diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index 11b0170e7d60..b64b13ac3241 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -282,11 +282,11 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO if readOnly { readPercent = 100 } + err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf( `./cockroach workload run kv --read-percent %d --write-seq R%d `+ - `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ - `--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-2}`, - readPercent, insertCount)) + `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors %s {pgurl:1-2}`, readPercent, insertCount, + roachtestutil.GetWorkloadHistogramArgs(t, c, getKVLabels(256, insertCount, readPercent)))) if ctx.Err() != nil { return nil // test requested workload shutdown } @@ -460,7 +460,7 @@ func runFailoverPartialLeaseGateway(ctx context.Context, t test.Test, c cluster. cancelWorkload := m.GoWithCancel(func(ctx context.Context) error { err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+ `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ - `--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:6-7}`) + roachtestutil.GetWorkloadHistogramArgs(t, c, getKVLabels(256, 0, 50))+` {pgurl:6-7}`) if ctx.Err() != nil { return nil // test requested workload shutdown } @@ -603,7 +603,7 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C cancelWorkload := m.GoWithCancel(func(ctx context.Context) error { err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+ `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ - `--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`) + roachtestutil.GetWorkloadHistogramArgs(t, c, getKVLabels(256, 0, 50))+` {pgurl:1-3}`) if ctx.Err() != nil { return nil // test requested workload shutdown } @@ -725,7 +725,7 @@ func runFailoverPartialLeaseLiveness(ctx context.Context, t test.Test, c cluster cancelWorkload := m.GoWithCancel(func(ctx context.Context) error { err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+ `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ - `--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`) + roachtestutil.GetWorkloadHistogramArgs(t, c, getKVLabels(256, 0, 50))+` {pgurl:1-3}`) if ctx.Err() != nil { return nil // test requested workload shutdown } @@ -836,7 +836,7 @@ func runFailoverNonSystem( cancelWorkload := m.GoWithCancel(func(ctx context.Context) error { err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+ `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ - `--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`) + roachtestutil.GetWorkloadHistogramArgs(t, c, getKVLabels(256, 0, 50))+` {pgurl:1-3}`) if ctx.Err() != nil { return nil // test requested workload shutdown } @@ -949,7 +949,7 @@ func runFailoverLiveness( cancelWorkload := m.GoWithCancel(func(ctx context.Context) error { err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+ `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ - `--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`) + roachtestutil.GetWorkloadHistogramArgs(t, c, getKVLabels(256, 0, 50))+` {pgurl:1-3}`) if ctx.Err() != nil { return nil // test requested workload shutdown } @@ -1061,7 +1061,7 @@ func runFailoverSystemNonLiveness( cancelWorkload := m.GoWithCancel(func(ctx context.Context) error { err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), `./cockroach workload run kv --read-percent 50 `+ `--concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ - `--histograms=`+t.PerfArtifactsDir()+`/stats.json {pgurl:1-3}`) + roachtestutil.GetWorkloadHistogramArgs(t, c, getKVLabels(256, 0, 50))+` {pgurl:1-3}`) if ctx.Err() != nil { return nil // test requested workload shutdown } @@ -1828,3 +1828,11 @@ func failoverRestartOpts() option.StartOpts { startOpts.RoachprodOpts.SkipInit = true return startOpts } + +func getKVLabels(concurrency int, insertCount int, readPercent int) map[string]string { + return map[string]string{ + "concurrency": fmt.Sprintf("%d", concurrency), + "insert_count": fmt.Sprintf("%d", insertCount), + "read_percent": fmt.Sprintf("%d", readPercent), + } +} diff --git a/pkg/cmd/roachtest/tests/import.go b/pkg/cmd/roachtest/tests/import.go index ad98b1310ffb..21ac4dc0fe15 100644 --- a/pkg/cmd/roachtest/tests/import.go +++ b/pkg/cmd/roachtest/tests/import.go @@ -9,7 +9,6 @@ import ( "context" gosql "database/sql" "fmt" - "path/filepath" "strings" "time" @@ -137,7 +136,8 @@ func registerImportTPCC(r registry.Registry) { m.Go(hc.Runner) } - tick, perfBuf := initBulkJobPerfArtifacts(testName, timeout) + exporter := roachtestutil.CreateWorkloadHistogramExporter(t, c) + tick, perfBuf := initBulkJobPerfArtifacts(timeout, t, exporter) workloadStr := `./cockroach workload fixtures import tpcc --warehouses=%d --csv-server='http://localhost:8081' {pgurl:1}` m.Go(func(ctx context.Context) error { defer dul.Done() @@ -148,7 +148,7 @@ func registerImportTPCC(r registry.Registry) { } else { defer hc.Done() } - cmd := fmt.Sprintf(workloadStr, warehouses) + cmd := fmt.Sprintf(workloadStr, 1) // Tick once before starting the import, and once after to capture the // total elapsed time. This is used by roachperf to compute and display // the average MB/sec per node. @@ -156,14 +156,8 @@ func registerImportTPCC(r registry.Registry) { c.Run(ctx, option.WithNodes(c.Node(1)), cmd) tick() - // Upload the perf artifacts to any one of the nodes so that the test - // runner copies it into an appropriate directory path. - dest := filepath.Join(t.PerfArtifactsDir(), "stats.json") - if err := c.RunE(ctx, option.WithNodes(c.Node(1)), "mkdir -p "+filepath.Dir(dest)); err != nil { - t.L().ErrorfCtx(ctx, "failed to create perf dir: %+v", err) - } - if err := c.PutString(ctx, perfBuf.String(), dest, 0755, c.Node(1)); err != nil { - t.L().ErrorfCtx(ctx, "failed to upload perf artifacts to node: %s", err.Error()) + if _, err := roachtestutil.CreateStatsFileInClusterFromExporter(ctx, t, c, perfBuf, exporter, c.Node(1)); err != nil { + return err } return nil }) @@ -238,7 +232,9 @@ func registerImportTPCH(r registry.Registry) { EncryptionSupport: registry.EncryptionMetamorphic, Leases: registry.MetamorphicLeases, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - tick, perfBuf := initBulkJobPerfArtifacts(t.Name(), item.timeout) + exporter := roachtestutil.CreateWorkloadHistogramExporter(t, c) + + tick, perfBuf := initBulkJobPerfArtifacts(item.timeout, t, exporter) c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings()) conn := c.Conn(ctx, t.L(), 1) @@ -323,14 +319,8 @@ func registerImportTPCH(r registry.Registry) { } tick() - // Upload the perf artifacts to any one of the nodes so that the test - // runner copies it into an appropriate directory path. - dest := filepath.Join(t.PerfArtifactsDir(), "stats.json") - if err := c.RunE(ctx, option.WithNodes(c.Node(1)), "mkdir -p "+filepath.Dir(dest)); err != nil { - t.L().ErrorfCtx(ctx, "failed to create perf dir: %+v", err) - } - if err := c.PutString(ctx, perfBuf.String(), dest, 0755, c.Node(1)); err != nil { - t.L().ErrorfCtx(ctx, "failed to upload perf artifacts to node: %s", err.Error()) + if _, err = roachtestutil.CreateStatsFileInClusterFromExporter(ctx, t, c, perfBuf, exporter, c.Node(1)); err != nil { + return err } return nil }) diff --git a/pkg/cmd/roachtest/tests/indexes.go b/pkg/cmd/roachtest/tests/indexes.go index 967a43d95095..94422ff29b19 100644 --- a/pkg/cmd/roachtest/tests/indexes.go +++ b/pkg/cmd/roachtest/tests/indexes.go @@ -130,8 +130,11 @@ func registerNIndexes(r registry.Registry, secondaryIndexes int) { payload := " --payload=64" concurrency := roachtestutil.IfLocal(c, "", " --concurrency="+strconv.Itoa(conc)) duration := " --duration=" + roachtestutil.IfLocal(c, "10s", "10m") - runCmd := fmt.Sprintf("./workload run indexes --histograms="+t.PerfArtifactsDir()+"/stats.json"+ - payload+concurrency+duration+" {pgurl%s}", gatewayNodes) + labels := map[string]string{ + "concurrency": fmt.Sprintf("%d", conc), + "parallel_writes": fmt.Sprintf("%d", parallelWrites), + } + runCmd := fmt.Sprintf("./workload run indexes %s %s %s %s {pgurl%s}", roachtestutil.GetWorkloadHistogramArgs(t, c, labels), payload, concurrency, duration, gatewayNodes) c.Run(ctx, option.WithNodes(c.WorkloadNode()), runCmd) return nil }) diff --git a/pkg/cmd/roachtest/tests/kv.go b/pkg/cmd/roachtest/tests/kv.go index 6297e877a9aa..ab320d400702 100644 --- a/pkg/cmd/roachtest/tests/kv.go +++ b/pkg/cmd/roachtest/tests/kv.go @@ -147,7 +147,7 @@ func registerKV(r registry.Registry) { } else { readPercent = fmt.Sprintf(" --read-percent=%d", opts.readPercent) } - histograms := " --histograms=" + t.PerfArtifactsDir() + "/stats.json" + var batchSize string if opts.batchSize > 0 { batchSize = fmt.Sprintf(" --batch=%d", opts.batchSize) @@ -171,6 +171,8 @@ func registerKV(r registry.Registry) { if opts.sharedProcessMT { url = fmt.Sprintf(" {pgurl:1-%d:%s}", nodes, appTenantName) } + + histograms := " " + roachtestutil.GetWorkloadHistogramArgs(t, c, nil) cmd := fmt.Sprintf( "./cockroach workload run kv --tolerate-errors --init --user=%s --password=%s", install.DefaultUser, install.DefaultPassword, ) + diff --git a/pkg/cmd/roachtest/tests/large_schema_benchmark.go b/pkg/cmd/roachtest/tests/large_schema_benchmark.go index 94e8e45d536e..110d8fc51e52 100644 --- a/pkg/cmd/roachtest/tests/large_schema_benchmark.go +++ b/pkg/cmd/roachtest/tests/large_schema_benchmark.go @@ -196,14 +196,11 @@ func registerLargeSchemaBenchmark(r registry.Registry, numTables int, isMultiReg populateFileName := fmt.Sprintf("populate_%d", dbListType) mon.Go(func(ctx context.Context) error { waitEnabled := "--wait 0.0" - // Export histograms out for the roach perf dashboard - histograms := " --histograms=" + t.PerfArtifactsDir() + "/stats.json" var wlInstance []workloadInstance // Inactive databases will intentionally have wait time on // them and not include them in our histograms. if dbListType == inactiveDbListType { waitEnabled = "--wait 1.0" - histograms = "" // Use a different prometheus port for the inactive databases, // this will not be measured. wlInstance = append( @@ -224,7 +221,7 @@ func registerLargeSchemaBenchmark(r registry.Registry, numTables int, isMultiReg WorkloadInstances: wlInstance, Duration: time.Minute * 60, ExtraRunArgs: fmt.Sprintf("--db-list-file=%s --txn-preamble-file=%s --admin-urls=%q "+ - "--console-api-file=apiCalls --console-api-username=%q --console-api-password=%q --conns=%d --workers=%d %s %s", + "--console-api-file=apiCalls --console-api-username=%q --console-api-password=%q --conns=%d --workers=%d %s", populateFileName, "ormQueries.sql", strings.Join(webConsoleURLs, ","), @@ -232,8 +229,7 @@ func registerLargeSchemaBenchmark(r registry.Registry, numTables int, isMultiReg "roacher", numWorkers, numWorkers, - waitEnabled, - histograms), + waitEnabled), } runTPCC(ctx, t, t.L(), c, options) return nil diff --git a/pkg/cmd/roachtest/tests/ledger.go b/pkg/cmd/roachtest/tests/ledger.go index b118d21b026d..1851087e65b3 100644 --- a/pkg/cmd/roachtest/tests/ledger.go +++ b/pkg/cmd/roachtest/tests/ledger.go @@ -43,9 +43,14 @@ func registerLedger(r registry.Registry) { concurrency := roachtestutil.IfLocal(c, "", " --concurrency="+fmt.Sprint(nodes*32)) duration := " --duration=" + roachtestutil.IfLocal(c, "10s", "10m") + labels := map[string]string{ + "concurrency": fmt.Sprint(nodes * 32), + "duration": roachtestutil.IfLocal(c, "10000", "600000"), + } + // See https://github.com/cockroachdb/cockroach/issues/94062 for the --data-loader. - cmd := fmt.Sprintf("./workload run ledger --init --data-loader=INSERT --histograms="+t.PerfArtifactsDir()+"/stats.json"+ - concurrency+duration+" {pgurl%s}", gatewayNodes) + cmd := fmt.Sprintf("./workload run ledger --init --data-loader=INSERT %s %s %s {pgurl%s}", + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), concurrency, duration, gatewayNodes) c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd) return nil }) diff --git a/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go b/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go index f28789066b4a..e4991936974d 100644 --- a/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go +++ b/pkg/cmd/roachtest/tests/loss_of_quorum_recovery.go @@ -9,8 +9,8 @@ import ( "bytes" "context" gosql "database/sql" - "encoding/json" "fmt" + "io" "path" "path/filepath" "time" @@ -343,8 +343,10 @@ func runRecoverLossOfQuorum(ctx context.Context, t test.Test, c cluster.Cluster, } } - recordOutcome, buffer := initPerfCapture() - recordOutcome(testOutcome) + recordOutcome, buffer := initPerfCapture(t, c) + if err := recordOutcome(testOutcome); err != nil { + t.L().Errorf("failed to record outcome: %s", err) + } buffer.upload(ctx, t, c) if testOutcome == success { @@ -569,8 +571,10 @@ func runHalfOnlineRecoverLossOfQuorum( } } - recordOutcome, buffer := initPerfCapture() - recordOutcome(testOutcome) + recordOutcome, buffer := initPerfCapture(t, c) + if err = recordOutcome(testOutcome); err != nil { + t.L().Errorf("failed to record outcome: %s", err) + } buffer.upload(ctx, t, c) if testOutcome == success { @@ -619,7 +623,7 @@ type perfArtifact bytes.Buffer func (p *perfArtifact) upload(ctx context.Context, t test.Test, c cluster.Cluster) { // Upload the perf artifacts to any one of the nodes so that the test // runner copies it into an appropriate directory path. - dest := filepath.Join(t.PerfArtifactsDir(), "stats.json") + dest := filepath.Join(t.PerfArtifactsDir(), roachtestutil.GetBenchmarkMetricsFileName(t)) if err := c.RunE(ctx, option.WithNodes(c.Node(1)), "mkdir -p "+filepath.Dir(dest)); err != nil { t.L().Errorf("failed to create perf dir: %+v", err) } @@ -631,20 +635,25 @@ func (p *perfArtifact) upload(ctx context.Context, t test.Test, c cluster.Cluste // Register histogram and create a function that would record test outcome value. // Returned buffer contains all recorded ticks for the test and is updated // every time metric function is called. -func initPerfCapture() (func(testOutcomeMetric), *perfArtifact) { - reg := histogram.NewRegistry(time.Second*time.Duration(success), histogram.MockWorkloadName) +func initPerfCapture( + t test.Test, c cluster.Cluster, +) (func(testOutcomeMetric) error, *perfArtifact) { + exporter := roachtestutil.CreateWorkloadHistogramExporter(t, c) + reg := histogram.NewRegistryWithExporter(time.Second*time.Duration(success), histogram.MockWorkloadName, exporter) bytesBuf := bytes.NewBuffer([]byte{}) - jsonEnc := json.NewEncoder(bytesBuf) + writer := io.Writer(bytesBuf) + exporter.Init(&writer) writeSnapshot := func() { reg.Tick(func(tick histogram.Tick) { - _ = jsonEnc.Encode(tick.Snapshot()) + _ = tick.Exporter.SnapshotAndWrite(tick.Hist, tick.Now, tick.Elapsed, &tick.Name) }) } - recordOutcome := func(metric testOutcomeMetric) { + recordOutcome := func(metric testOutcomeMetric) error { reg.GetHandle().Get("recovery_result").Record(time.Duration(metric) * time.Second) writeSnapshot() + return exporter.Close(nil) } // Capture start time for the test. diff --git a/pkg/cmd/roachtest/tests/perturbation/framework.go b/pkg/cmd/roachtest/tests/perturbation/framework.go index e7c5bbe4cb37..40bcfa8c4ce4 100644 --- a/pkg/cmd/roachtest/tests/perturbation/framework.go +++ b/pkg/cmd/roachtest/tests/perturbation/framework.go @@ -9,11 +9,10 @@ import ( "bytes" "context" gosql "database/sql" - "encoding/json" "fmt" + "io" "math" "math/rand" - "path/filepath" "reflect" "sort" "strings" @@ -483,7 +482,7 @@ func (v variations) runTest(ctx context.Context, t test.Test, c cluster.Cluster) t.Status("T5: validating results") require.NoError(t, roachtestutil.DownloadProfiles(ctx, c, t.L(), t.ArtifactsDir())) - require.NoError(t, v.writePerfArtifacts(ctx, t.Name(), t.PerfArtifactsDir(), baselineStats, perturbationStats, + require.NoError(t, v.writePerfArtifacts(ctx, t, c, baselineStats, perturbationStats, afterStats)) t.L().Printf("validating stats during the perturbation") @@ -682,40 +681,42 @@ func sortedStringKeys(m map[string]trackedStat) []string { return keys } -// writePerfArtifacts writes the stats.json in the right format to node 1 so it +// writePerfArtifacts writes the stats file in the right format to node 1 so it // can be picked up by roachperf. Currently it only writes the write stats since // there would be too many lines on the graph otherwise. func (v variations) writePerfArtifacts( ctx context.Context, - name string, - perfDir string, + t test.Test, + c cluster.Cluster, baseline, perturbation, recovery map[string]trackedStat, ) error { - reg := histogram.NewRegistry( + + exporter := roachtestutil.CreateWorkloadHistogramExporter(t, c) + + reg := histogram.NewRegistryWithExporter( time.Second, histogram.MockWorkloadName, + exporter, ) + + bytesBuf := bytes.NewBuffer([]byte{}) + writer := io.Writer(bytesBuf) + exporter.Init(&writer) + reg.GetHandle().Get("baseline").Record(baseline["write"].score) reg.GetHandle().Get("perturbation").Record(perturbation["write"].score) reg.GetHandle().Get("recovery").Record(recovery["write"].score) - bytesBuf := bytes.NewBuffer([]byte{}) - jsonEnc := json.NewEncoder(bytesBuf) var err error reg.Tick(func(tick histogram.Tick) { - err = jsonEnc.Encode(tick.Snapshot()) + err = tick.Exporter.SnapshotAndWrite(tick.Hist, tick.Now, tick.Elapsed, &tick.Name) }) if err != nil { return err } node := v.Node(1) - // Upload the perf artifacts to the given node. - if err := v.RunE(ctx, option.WithNodes(node), "mkdir -p "+perfDir); err != nil { - return err - } - path := filepath.Join(perfDir, "stats.json") - if err := v.PutString(ctx, bytesBuf.String(), path, 0755, node); err != nil { + if _, err := roachtestutil.CreateStatsFileInClusterFromExporter(ctx, t, c, bytesBuf, exporter, node); err != nil { return err } return nil diff --git a/pkg/cmd/roachtest/tests/queue.go b/pkg/cmd/roachtest/tests/queue.go index 930094e49d04..3f64b1b1287c 100644 --- a/pkg/cmd/roachtest/tests/queue.go +++ b/pkg/cmd/roachtest/tests/queue.go @@ -54,13 +54,18 @@ func runQueue(ctx context.Context, t test.Test, c cluster.Cluster) { if initTables { init = " --init" } + labels := map[string]string{ + "batch": "100", + "concurrency": roachtestutil.IfLocal(c, "", fmt.Sprint(dbNodeCount*64)), + "duration": duration, + } cmd := fmt.Sprintf( - "./workload run queue --histograms="+t.PerfArtifactsDir()+"/stats.json"+ - init+ - concurrency+ - duration+ - batch+ - " {pgurl%s}", + "./workload run queue %s %s %s %s %s {pgurl%s}", + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), + init, + concurrency, + duration, + batch, c.CRDBNodes(), ) c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd) diff --git a/pkg/cmd/roachtest/tests/restore.go b/pkg/cmd/roachtest/tests/restore.go index 0c4daa4f1ece..dba8578f4e95 100644 --- a/pkg/cmd/roachtest/tests/restore.go +++ b/pkg/cmd/roachtest/tests/restore.go @@ -8,11 +8,10 @@ package tests import ( "bytes" "context" - "encoding/json" "fmt" + "io" "math/rand" "net/http" - "path/filepath" "strings" "time" @@ -1124,30 +1123,31 @@ func exportToRoachperf( ctx context.Context, t test.Test, c cluster.Cluster, testName string, metric int64, ) { + exporter := roachtestutil.CreateWorkloadHistogramExporter(t, c) + // The easiest way to record a precise metric for roachperf is to caste it as a duration, // in seconds in the histogram's upper bound. - reg := histogram.NewRegistry( - time.Duration(metric)*time.Second, - histogram.MockWorkloadName, - ) + reg := histogram.NewRegistryWithExporter(time.Duration(metric)*time.Second, histogram.MockWorkloadName, exporter) + bytesBuf := bytes.NewBuffer([]byte{}) - jsonEnc := json.NewEncoder(bytesBuf) + writer := io.Writer(bytesBuf) + exporter.Init(&writer) + var err error // Ensure the histogram contains the name of the roachtest reg.GetHandle().Get(testName) // Serialize the histogram into the buffer reg.Tick(func(tick histogram.Tick) { - _ = jsonEnc.Encode(tick.Snapshot()) + err = tick.Exporter.SnapshotAndWrite(tick.Hist, tick.Now, tick.Elapsed, &tick.Name) }) - // Upload the perf artifacts to any one of the nodes so that the test - // runner copies it into an appropriate directory path. - dest := filepath.Join(t.PerfArtifactsDir(), "stats.json") - if err := c.RunE(ctx, option.WithNodes(c.Node(1)), "mkdir -p "+filepath.Dir(dest)); err != nil { - t.L().ErrorfCtx(ctx, "failed to create perf dir: %+v", err) + + if err != nil { + return } - if err := c.PutString(ctx, bytesBuf.String(), dest, 0755, c.Node(1)); err != nil { - t.L().ErrorfCtx(ctx, "failed to upload perf artifacts to node: %s", err.Error()) + if _, err = roachtestutil.CreateStatsFileInClusterFromExporter(ctx, t, c, bytesBuf, exporter, c.Node(1)); err != nil { + t.L().Errorf("error creating stats file: %s", err) + return } } diff --git a/pkg/cmd/roachtest/tests/schemachange_random_load.go b/pkg/cmd/roachtest/tests/schemachange_random_load.go index c784e0e9506b..71616f43e8e5 100644 --- a/pkg/cmd/roachtest/tests/schemachange_random_load.go +++ b/pkg/cmd/roachtest/tests/schemachange_random_load.go @@ -14,6 +14,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -109,13 +110,18 @@ func runSchemaChangeRandomLoad( runCmd := []string{ "./workload run schemachange --verbose=1", "--tolerate-errors=false", - // Save the histograms so that they can be reported to https://roachperf.crdb.dev/. - " --histograms=" + t.PerfArtifactsDir() + "/stats.json", fmt.Sprintf("--max-ops %d", maxOps), fmt.Sprintf("--concurrency %d", concurrency), fmt.Sprintf("--txn-log %s", filepath.Join(storeDirectory, txnLogFile)), fmt.Sprintf("{pgurl%s}", loadNode), } + + extraLabels := map[string]string{ + "concurrency": fmt.Sprintf("%d", concurrency), + "max-ops": fmt.Sprintf("%d", maxOps), + } + + runCmd = append(runCmd, roachtestutil.GetWorkloadHistogramArgs(t, c, extraLabels)) t.Status("running schemachange workload") err = c.RunE(ctx, option.WithNodes(loadNode), runCmd...) if err != nil { diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 49980a2572a3..802806ce4912 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -9,6 +9,7 @@ import ( "context" gosql "database/sql" "fmt" + "maps" "math" "math/rand" "os" @@ -17,6 +18,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" @@ -279,7 +281,7 @@ func runTPCC( // Make a copy of i for the goroutine. i := i m.Go(func(ctx context.Context) error { - // Only prefix stats.json with workload_i_ if we have multiple workloads, + // Only prefix stats file with workload_i_ if we have multiple workloads, // in case other processes relied on previous behavior. var statsPrefix string if len(workloadInstances) > 1 { @@ -288,11 +290,18 @@ func runTPCC( l.Printf("running tpcc worker=%d warehouses=%d ramp=%s duration=%s on %s (<%s)", i, opts.Warehouses, rampDur, opts.Duration, pgURLs[i], time.Minute) - histogramsPath := fmt.Sprintf("%s/%sstats.json", t.PerfArtifactsDir(), statsPrefix) + fileName := roachtestutil.GetBenchmarkMetricsFileName(t) + histogramsPath := fmt.Sprintf("%s/%s%s", t.PerfArtifactsDir(), statsPrefix, fileName) + var labelsMap map[string]string + if t.ExportOpenmetrics() { + labelsMap = getTpccLabels(opts.Warehouses, rampDur, opts.Duration, map[string]string{"database": opts.DB}) + } cmd := roachtestutil.NewCommand("%s workload run %s", test.DefaultCockroachPath, opts.getWorkloadCmd()). MaybeFlag(opts.DB != "", "db", opts.DB). Flag("warehouses", opts.Warehouses). MaybeFlag(!opts.DisableHistogram, "histograms", histogramsPath). + MaybeFlag(t.ExportOpenmetrics(), "histogram-export-format", "openmetrics"). + MaybeFlag(t.ExportOpenmetrics(), "openmetrics-labels", clusterstats.GetOpenmetricsLabelString(t, c, labelsMap)). Flag("ramp", rampDur). Flag("duration", opts.Duration). Flag("prometheus-port", workloadInstances[i].prometheusPort). @@ -484,11 +493,18 @@ func runTPCCMixedHeadroom(ctx context.Context, t test.Test, c cluster.Cluster) { workloadDur = 100 * time.Minute } } + histogramsPath := fmt.Sprintf("%s/%s", t.PerfArtifactsDir(), roachtestutil.GetBenchmarkMetricsFileName(t)) + var labelsMap map[string]string + if t.ExportOpenmetrics() { + labelsMap = getTpccLabels(headroomWarehouses, rampDur, workloadDur/time.Millisecond, nil) + } cmd := roachtestutil.NewCommand("./cockroach workload run tpcc"). Arg("{pgurl%s}", c.CRDBNodes()). Flag("duration", workloadDur). Flag("warehouses", headroomWarehouses). - Flag("histograms", t.PerfArtifactsDir()+"/stats.json"). + Flag("histograms", histogramsPath). + MaybeFlag(t.ExportOpenmetrics(), "histogram-export-format", "openmetrics"). + MaybeFlag(t.ExportOpenmetrics(), "openmetrics-labels", clusterstats.GetOpenmetricsLabelString(t, c, labelsMap)). Flag("ramp", rampDur). Flag("prometheus-port", 2112). Flag("pprofport", workloadPProfStartPort). @@ -1667,15 +1683,18 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBen extraFlags += " --method=simple" } t.Status(fmt.Sprintf("running benchmark, warehouses=%d", warehouses)) - histogramsPath := fmt.Sprintf("%s/warehouses=%d/stats.json", t.PerfArtifactsDir(), warehouses) + histogramsPath := fmt.Sprintf("%s/warehouses=%d/%s", t.PerfArtifactsDir(), warehouses, roachtestutil.GetBenchmarkMetricsFileName(t)) var tenantSuffix string if b.SharedProcessMT { tenantSuffix = fmt.Sprintf(":%s", appTenantName) } + + labels := getTpccLabels(warehouses, rampDur, loadDur, nil) + cmd := fmt.Sprintf("./cockroach workload run tpcc --warehouses=%d --active-warehouses=%d "+ - "--tolerate-errors --ramp=%s --duration=%s%s --histograms=%s {pgurl%s%s}", + "--tolerate-errors --ramp=%s --duration=%s%s %s {pgurl%s%s}", b.LoadWarehouses(c.Cloud()), warehouses, rampDur, - loadDur, extraFlags, histogramsPath, sqlGateways, tenantSuffix) + loadDur, extraFlags, roachtestutil.GetWorkloadHistogramArgs(t, c, labels), sqlGateways, tenantSuffix) err := c.RunE(ctx, option.WithNodes(group.LoadNodes), cmd) loadDone <- timeutil.Now() if err != nil { @@ -1683,23 +1702,26 @@ func runTPCCBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpccBen // count. return errors.Wrapf(err, "error running tpcc load generator") } - roachtestHistogramsPath := filepath.Join(resultsDir, fmt.Sprintf("%d.%d-stats.json", warehouses, groupIdx)) - if err := c.Get( - ctx, t.L(), histogramsPath, roachtestHistogramsPath, group.LoadNodes, - ); err != nil { - // NB: this will let the line search continue. The reason we do this - // is because it's conceivable that we made it here, but a VM just - // froze up on us. The next search iteration will handle this state. - return err - } - snapshots, err := histogram.DecodeSnapshots(roachtestHistogramsPath) - if err != nil { - // If we got this far, and can't decode data, it's not a case of - // overload but something that deserves failing the whole test. - t.Fatal(err) + if !t.ExportOpenmetrics() { + roachtestHistogramsPath := filepath.Join(resultsDir, fmt.Sprintf("%d.%d-stats.json", warehouses, groupIdx)) + if err := c.Get( + ctx, t.L(), histogramsPath, roachtestHistogramsPath, group.LoadNodes, + ); err != nil { + // NB: this will let the line search continue. The reason we do this + // is because it's conceivable that we made it here, but a VM just + // froze up on us. The next search iteration will handle this state. + return err + } + snapshots, err := histogram.DecodeSnapshots(roachtestHistogramsPath) + if err != nil { + // If we got this far, and can't decode data, it's not a case of + // overload but something that deserves failing the whole test. + t.Fatal(err) + } + result := tpcc.NewResultWithSnapshots(warehouses, 0, snapshots) + resultChan <- result + return nil } - result := tpcc.NewResultWithSnapshots(warehouses, 0, snapshots) - resultChan <- result return nil }) } @@ -1832,3 +1854,19 @@ func setupPrometheusForRoachtest( } return cfg, cleanupFunc } + +func getTpccLabels( + warehouses int, rampDur time.Duration, duration time.Duration, extraLabels map[string]string, +) map[string]string { + labels := map[string]string{ + "warehouses": fmt.Sprintf("%d", warehouses), + "duration": duration.String(), + "ramp": rampDur.String(), + } + + if extraLabels != nil { + maps.Copy(labels, extraLabels) + } + + return labels +} diff --git a/pkg/cmd/roachtest/tests/tpce.go b/pkg/cmd/roachtest/tests/tpce.go index 6984127cec0c..9bc485bc700b 100644 --- a/pkg/cmd/roachtest/tests/tpce.go +++ b/pkg/cmd/roachtest/tests/tpce.go @@ -7,6 +7,7 @@ package tests import ( "bufio" + "bytes" "context" "encoding/json" "fmt" @@ -15,6 +16,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/clusterstats" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" @@ -23,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -254,7 +257,7 @@ func runTPCE(ctx context.Context, t test.Test, c cluster.Cluster, opts tpceOptio } if opts.exportMetrics { - return exportTPCEResults(t, result.Stdout) + return exportTPCEResults(t, c, result.Stdout) } return nil }) @@ -327,7 +330,7 @@ type tpceMetrics struct { // // The Measured Throughput is computed as the total number of Valid Trade-Result Transactions // within the Measurement Interval divided by the duration of the Measurement Interval in seconds. -func exportTPCEResults(t test.Test, result string) error { +func exportTPCEResults(t test.Test, c cluster.Cluster, result string) error { // Filter out everything but the TradeResult transaction metrics. // // Example output of TPCE: @@ -357,7 +360,20 @@ func exportTPCEResults(t test.Test, result string) error { P99Latency: removeUnits(fields[11]), PMaxLatency: removeUnits(fields[12]), } - metricBytes, err := json.Marshal(metrics) + + var metricBytes []byte + var err error + fileName := roachtestutil.GetBenchmarkMetricsFileName(t) + if t.ExportOpenmetrics() { + labels := map[string]string{ + "workload": "tpce", + } + labelString := clusterstats.GetOpenmetricsLabelString(t, c, labels) + metricBytes = getOpenMetrics(metrics, fields[5], labelString) + } else { + metricBytes, err = json.Marshal(metrics) + } + if err != nil { return err } @@ -369,7 +385,26 @@ func exportTPCEResults(t test.Test, result string) error { return err } - return os.WriteFile(fmt.Sprintf("%s/stats.json", perfDir), metricBytes, 0666) + return os.WriteFile(fmt.Sprintf("%s/%s", perfDir, fileName), metricBytes, 0666) } return errors.Errorf("exportTPCEResults: found no lines starting with TradeResult") } + +func getOpenMetrics(metrics tpceMetrics, countOfLatencies string, labelString string) []byte { + + var buffer bytes.Buffer + now := timeutil.Now().Unix() + + buffer.WriteString("# TYPE tpce_latency summary\n") + buffer.WriteString("# HELP tpce_latency Latency metrics for TPC-E transactions\n") + buffer.WriteString(fmt.Sprintf("tpce_latency{%s,quantile=\"0.5\"} %s %d\n", labelString, metrics.P50Latency, now)) + buffer.WriteString(fmt.Sprintf("tpce_latency{%s,quantile=\"0.9\"} %s %d\n", labelString, metrics.P90Latency, now)) + buffer.WriteString(fmt.Sprintf("tpce_latency{%s,quantile=\"0.99\"} %s %d\n", labelString, metrics.P99Latency, now)) + buffer.WriteString(fmt.Sprintf("tpce_latency{%s,quantile=\"1.0\"} %s %d\n", labelString, metrics.PMaxLatency, now)) + buffer.WriteString(fmt.Sprintf("tpce_latency_sum{%s} %d %d\n", labelString, 0, now)) + buffer.WriteString(fmt.Sprintf("tpce_latency_count{%s} %s %d\n", labelString, countOfLatencies, now)) + buffer.WriteString("# EOF") + + metricsBytes := buffer.Bytes() + return metricsBytes +} diff --git a/pkg/cmd/roachtest/tests/tpchbench.go b/pkg/cmd/roachtest/tests/tpchbench.go index fa9de0a6580f..17a7b18c4b2e 100644 --- a/pkg/cmd/roachtest/tests/tpchbench.go +++ b/pkg/cmd/roachtest/tests/tpchbench.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" @@ -78,15 +79,20 @@ func runTPCHBench(ctx context.Context, t test.Test, c cluster.Cluster, b tpchBen // run b.numRunsPerQuery number of times. maxOps := b.numRunsPerQuery * numQueries + labels := map[string]string{ + "max_ops": fmt.Sprintf("%d", maxOps), + "num_queries": fmt.Sprintf("%d", numQueries), + } + // Run with only one worker to get best-case single-query performance. cmd := fmt.Sprintf( "./workload run querybench --db=tpch --concurrency=1 --query-file=%s "+ - "--num-runs=%d --max-ops=%d {pgurl%s} "+ - "--histograms="+t.PerfArtifactsDir()+"/stats.json --histograms-max-latency=%s", + "--num-runs=%d --max-ops=%d {pgurl%s} %s --histograms-max-latency=%s", filename, b.numRunsPerQuery, maxOps, c.CRDBNodes(), + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), b.maxLatency.String(), ) if err := c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd); err != nil { diff --git a/pkg/cmd/roachtest/tests/ycsb.go b/pkg/cmd/roachtest/tests/ycsb.go index b3e2eac928a3..a8fe5e33ea0f 100644 --- a/pkg/cmd/roachtest/tests/ycsb.go +++ b/pkg/cmd/roachtest/tests/ycsb.go @@ -88,11 +88,17 @@ func registerYCSB(r registry.Registry) { defaultDuration := roachtestutil.IfLocal(c, "10s", "30m") args += roachtestutil.GetEnvWorkloadDurationValueOrDefault(defaultDuration) + + labels := map[string]string{ + "workload_ycsb_type": wl, + "concurrency": fmt.Sprintf("%d", conc), + "cpu": fmt.Sprintf("%d", cpus), + } + cmd := fmt.Sprintf( "./cockroach workload run ycsb --init --insert-count=1000000 --workload=%s --concurrency=%d"+ - " --splits=%d --histograms="+t.PerfArtifactsDir()+"/stats.json"+args+ - " {pgurl%s}", - wl, conc, len(c.CRDBNodes()), c.CRDBNodes()) + " --splits=%d %s %s {pgurl%s}", wl, conc, len(c.CRDBNodes()), + roachtestutil.GetWorkloadHistogramArgs(t, c, labels), args, c.CRDBNodes()) c.Run(ctx, option.WithNodes(c.WorkloadNode()), cmd) return nil }) diff --git a/pkg/workload/cli/run.go b/pkg/workload/cli/run.go index 9ea322e71627..309684df5172 100644 --- a/pkg/workload/cli/run.go +++ b/pkg/workload/cli/run.go @@ -432,7 +432,7 @@ func runRun(gen workload.Generator, urls []string, dbName string) error { publisher = histogram.CreateUdpPublisher(*individualOperationReceiverAddr) } - metricsExporter, file, err := maybeInitAndCreateExporter() + metricsExporter, file, err := maybeInitAndCreateExporter(gen) if err != nil { return errors.Wrap(err, "error creating metrics exporter") } @@ -666,7 +666,7 @@ func maybeLogRandomSeed(ctx context.Context, gen workload.Generator) { } } -func maybeInitAndCreateExporter() (exporter.Exporter, *os.File, error) { +func maybeInitAndCreateExporter(gen workload.Generator) (exporter.Exporter, *os.File, error) { if *histograms == "" { return nil, nil, nil } @@ -687,6 +687,9 @@ func maybeInitAndCreateExporter() (exporter.Exporter, *os.File, error) { } labels[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1]) } + + // Append workload generator name as a tag + labels["workload"] = gen.Meta().Name openMetricsExporter := exporter.OpenMetricsExporter{} openMetricsExporter.SetLabels(&labels) metricsExporter = &openMetricsExporter diff --git a/pkg/workload/histogram/exporter/BUILD.bazel b/pkg/workload/histogram/exporter/BUILD.bazel index 1373cc5c1670..20f9e3e636fd 100644 --- a/pkg/workload/histogram/exporter/BUILD.bazel +++ b/pkg/workload/histogram/exporter/BUILD.bazel @@ -11,6 +11,7 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/workload/histogram/exporter", visibility = ["//visibility:public"], deps = [ + "//pkg/cmd/roachprod-microbench/util", "@com_github_codahale_hdrhistogram//:hdrhistogram", "@com_github_gogo_protobuf//proto", "@com_github_prometheus_client_model//go", diff --git a/pkg/workload/histogram/exporter/openmetrics_exporter.go b/pkg/workload/histogram/exporter/openmetrics_exporter.go index 389240b6d2e2..ec0b70c50c23 100644 --- a/pkg/workload/histogram/exporter/openmetrics_exporter.go +++ b/pkg/workload/histogram/exporter/openmetrics_exporter.go @@ -11,6 +11,7 @@ import ( "strings" "time" + "github.com/cockroachdb/cockroach/pkg/cmd/roachprod-microbench/util" "github.com/codahale/hdrhistogram" "github.com/gogo/protobuf/proto" prom "github.com/prometheus/client_model/go" @@ -94,6 +95,7 @@ func (o *OpenMetricsExporter) Close(f func() error) error { func (o *OpenMetricsExporter) emitGaugeMetric( name string, value float64, timestamp time.Time, ) error { + name = util.SanitizeMetricName(name) gaugeMetric := prom.MetricFamily{ Name: &name, Help: nil, @@ -139,8 +141,10 @@ func (o *OpenMetricsExporter) SetLabels(labels *map[string]string) { var labelValues []*prom.LabelPair for label, value := range *labels { - labelName := sanitizeOpenMetricsLabels(label) - labelValue := value + labelName := util.SanitizeKey(label) + + // In case the label value already has surrounding quotes, we should trim them + labelValue := util.SanitizeValue(strings.Trim(value, "\"")) labelPair := &prom.LabelPair{ Name: &labelName, Value: &labelValue, diff --git a/pkg/workload/histogram/exporter/util.go b/pkg/workload/histogram/exporter/util.go index 1dc33925fde0..6c4b756460b0 100644 --- a/pkg/workload/histogram/exporter/util.go +++ b/pkg/workload/histogram/exporter/util.go @@ -6,26 +6,18 @@ package exporter import ( - "regexp" "time" + "github.com/cockroachdb/cockroach/pkg/cmd/roachprod-microbench/util" "github.com/codahale/hdrhistogram" "github.com/gogo/protobuf/proto" prom "github.com/prometheus/client_model/go" ) var ( - invalidCharRegex = regexp.MustCompile(`[^a-zA-Z0-9_]`) - invalidFirstCharRegex = regexp.MustCompile(`^[^a-zA-Z_]`) - summaryQuantiles = []float64{50, 95, 99, 100} + summaryQuantiles = []float64{50, 95, 99, 100} ) -func sanitizeOpenMetricsLabels(input string) string { - sanitized := invalidCharRegex.ReplaceAllString(input, "_") - sanitized = invalidFirstCharRegex.ReplaceAllString(sanitized, "_") - return sanitized -} - // ConvertHdrHistogramToPrometheusMetricFamily converts a Hdr histogram into MetricFamily which is used // by expfmt.MetricFamilyToOpenMetrics to export openmetrics func ConvertHdrHistogramToPrometheusMetricFamily( @@ -52,8 +44,9 @@ func ConvertHdrHistogramToPrometheusMetricFamily( summary.Quantile = valueQuantiles summary.SampleCount = &totalCount timestampMs := proto.Int64(start.UTC().UnixMilli()) + sanitizedName := util.SanitizeMetricName(*name) return &prom.MetricFamily{ - Name: name, + Name: &sanitizedName, Type: prom.MetricType_SUMMARY.Enum(), Metric: []*prom.Metric{{ Summary: summary,