Skip to content

Commit

Permalink
Switch away from old style statsd metrics wrappers (#4606)
Browse files Browse the repository at this point in the history
In a handful of places I've nuked old stats which are not used in any alerts or dashboards as they either duplicate other stats or don't provide much insight/have never actually been used. If we feel like we need them again in the future it's trivial to add them back.

There aren't many dashboards that rely on old statsd style metrics, but a few will need to be updated when this change is deployed. There are also a few cases where prometheus labels have been changed from camel to snake case, dashboards that use these will also need to be updated. As far as I can tell no alerts are impacted by this change.

Fixes #4591.
  • Loading branch information
rolandshoemaker authored and Daniel McCarney committed Dec 18, 2019
1 parent 51f30fd commit 5b2f11e
Show file tree
Hide file tree
Showing 57 changed files with 505 additions and 919 deletions.
34 changes: 24 additions & 10 deletions akamai/cache-client.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/letsencrypt/boulder/core"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
"github.com/prometheus/client_golang/prometheus"
"golang.org/x/crypto/ocsp"
)

Expand Down Expand Up @@ -53,7 +54,8 @@ type CachePurgeClient struct {
retries int
retryBackoff time.Duration
log blog.Logger
stats metrics.Scope
purgeLatency prometheus.Histogram
purges *prometheus.CounterVec
clk clock.Clock
}

Expand All @@ -79,9 +81,20 @@ func NewCachePurgeClient(
retries int,
retryBackoff time.Duration,
log blog.Logger,
stats metrics.Scope,
stats prometheus.Registerer,
) (*CachePurgeClient, error) {
stats = stats.NewScope("CCU")
purgeLatency := prometheus.NewHistogram(prometheus.HistogramOpts{
Name: "ccu_purge_latency",
Help: "Histogram of latencies of CCU purges",
Buckets: metrics.InternetFacingBuckets,
})
stats.MustRegister(purgeLatency)
purges := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "ccu_purges",
Help: "A counter of CCU purges labelled by the result",
}, []string{"type"})
stats.MustRegister(purges)

if strings.HasSuffix(endpoint, "/") {
endpoint = endpoint[:len(endpoint)-1]
}
Expand All @@ -106,8 +119,9 @@ func NewCachePurgeClient(
retries: retries,
retryBackoff: retryBackoff,
log: log,
stats: stats,
clk: clock.Default(),
purgeLatency: purgeLatency,
purges: purges,
}, nil
}

Expand Down Expand Up @@ -195,9 +209,9 @@ func (cpc *CachePurgeClient) purge(urls []string) error {
cpc.log.Debugf("POSTing to %s with Authorization %s: %s",
endpoint, authHeader, reqJSON)

rS := cpc.clk.Now()
s := cpc.clk.Now()
resp, err := cpc.client.Do(req)
cpc.stats.TimingDuration("PurgeRequestLatency", time.Since(rS))
cpc.purgeLatency.Observe(cpc.clk.Since(s).Seconds())
if err != nil {
return err
}
Expand Down Expand Up @@ -241,23 +255,23 @@ func (cpc *CachePurgeClient) purgeBatch(urls []string) error {
err := cpc.purge(urls)
if err != nil {
if _, ok := err.(errFatal); ok {
cpc.stats.Inc("FatalFailures", 1)
cpc.purges.WithLabelValues("fatal failure").Inc()
return err
}
cpc.log.AuditErrf("Akamai cache purge failed, retrying: %s", err)
cpc.stats.Inc("RetryableFailures", 1)
cpc.purges.WithLabelValues("retryable failure").Inc()
continue
}
successful = true
break
}

if !successful {
cpc.stats.Inc("FatalFailures", 1)
cpc.purges.WithLabelValues("fatal failure").Inc()
return ErrAllRetriesFailed
}

cpc.stats.Inc("SuccessfulPurges", 1)
cpc.purges.WithLabelValues("success").Inc()
return nil
}

Expand Down
12 changes: 6 additions & 6 deletions akamai/cache-client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (

func TestConstructAuthHeader(t *testing.T) {
log := blog.NewMock()
stats := metrics.NewNoopScope()
stats := metrics.NoopRegisterer
cpc, err := NewCachePurgeClient(
"https://akaa-baseurl-xxxxxxxxxxx-xxxxxxxxxxxxx.luna.akamaiapis.net",
"akab-client-token-xxx-xxxxxxxxxxxxxxxx",
Expand Down Expand Up @@ -137,7 +137,7 @@ func TestV3Purge(t *testing.T) {
3,
time.Second,
blog.NewMock(),
metrics.NewNoopScope(),
metrics.NoopRegisterer,
)
test.AssertNotError(t, err, "Failed to create CachePurgeClient")
fc := clock.NewFake()
Expand Down Expand Up @@ -170,7 +170,7 @@ func TestNewCachePurgeClient(t *testing.T) {
3,
time.Second,
blog.NewMock(),
metrics.NewNoopScope(),
metrics.NoopRegisterer,
)
test.AssertError(t, err, "NewCachePurgeClient with invalid network parameter didn't error")

Expand All @@ -184,7 +184,7 @@ func TestNewCachePurgeClient(t *testing.T) {
3,
time.Second,
blog.NewMock(),
metrics.NewNoopScope(),
metrics.NoopRegisterer,
)
test.AssertNotError(t, err, "NewCachePurgeClient with valid network parameter errored")

Expand All @@ -198,7 +198,7 @@ func TestNewCachePurgeClient(t *testing.T) {
3,
time.Second,
blog.NewMock(),
metrics.NewNoopScope(),
metrics.NoopRegisterer,
)
test.AssertError(t, err, "NewCachePurgeClient with invalid server url parameter didn't error")
}
Expand All @@ -223,7 +223,7 @@ func TestBigBatchPurge(t *testing.T) {
3,
time.Second,
log,
metrics.NewNoopScope(),
metrics.NoopRegisterer,
)
test.AssertNotError(t, err, "Failed to create CachePurgeClient")

Expand Down
5 changes: 2 additions & 3 deletions bdns/dns.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,11 @@ type exchanger interface {
func NewDNSClientImpl(
readTimeout time.Duration,
servers []string,
stats metrics.Scope,
stats prometheus.Registerer,
clk clock.Clock,
maxTries int,
log blog.Logger,
) *DNSClientImpl {
stats = stats.NewScope("DNS")
dnsClient := new(dns.Client)

// Set timeout for underlying net.Conn
Expand Down Expand Up @@ -242,7 +241,7 @@ func NewDNSClientImpl(
func NewTestDNSClientImpl(
readTimeout time.Duration,
servers []string,
stats metrics.Scope,
stats prometheus.Registerer,
clk clock.Clock,
maxTries int,
log blog.Logger) *DNSClientImpl {
Expand Down
30 changes: 12 additions & 18 deletions bdns/dns_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,38 +241,32 @@ func TestMain(m *testing.M) {
os.Exit(ret)
}

func newTestStats() metrics.Scope {
return metrics.NewNoopScope()
}

var testStats = newTestStats()

func TestDNSNoServers(t *testing.T) {
obj := NewTestDNSClientImpl(time.Hour, []string{}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Hour, []string{}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

_, err := obj.LookupHost(context.Background(), "letsencrypt.org")

test.AssertError(t, err, "No servers")
}

func TestDNSOneServer(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

_, err := obj.LookupHost(context.Background(), "letsencrypt.org")

test.AssertNotError(t, err, "No message")
}

func TestDNSDuplicateServers(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr, dnsLoopbackAddr}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr, dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

_, err := obj.LookupHost(context.Background(), "letsencrypt.org")

test.AssertNotError(t, err, "No message")
}

func TestDNSLookupsNoServer(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

_, err := obj.LookupTXT(context.Background(), "letsencrypt.org")
test.AssertError(t, err, "No servers")
Expand All @@ -285,7 +279,7 @@ func TestDNSLookupsNoServer(t *testing.T) {
}

func TestDNSServFail(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())
bad := "servfail.com"

_, err := obj.LookupTXT(context.Background(), bad)
Expand All @@ -300,7 +294,7 @@ func TestDNSServFail(t *testing.T) {
}

func TestDNSLookupTXT(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

a, err := obj.LookupTXT(context.Background(), "letsencrypt.org")
t.Logf("A: %v", a)
Expand All @@ -314,7 +308,7 @@ func TestDNSLookupTXT(t *testing.T) {
}

func TestDNSLookupHost(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

ip, err := obj.LookupHost(context.Background(), "servfail.com")
t.Logf("servfail.com - IP: %s, Err: %s", ip, err)
Expand Down Expand Up @@ -381,7 +375,7 @@ func TestDNSLookupHost(t *testing.T) {
}

func TestDNSNXDOMAIN(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

hostname := "nxdomain.letsencrypt.org"
_, err := obj.LookupHost(context.Background(), hostname)
Expand All @@ -398,7 +392,7 @@ func TestDNSNXDOMAIN(t *testing.T) {
}

func TestDNSLookupCAA(t *testing.T) {
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), 1, blog.UseMock())
obj := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 1, blog.UseMock())

caas, err := obj.LookupCAA(context.Background(), "bracewel.net")
test.AssertNotError(t, err, "CAA lookup failed")
Expand Down Expand Up @@ -585,7 +579,7 @@ func TestRetry(t *testing.T) {
}

for i, tc := range tests {
dr := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), tc.maxTries, blog.UseMock())
dr := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), tc.maxTries, blog.UseMock())
dr.dnsClient = tc.te
_, err := dr.LookupTXT(context.Background(), "example.com")
if err == errTooManyRequests {
Expand All @@ -612,7 +606,7 @@ func TestRetry(t *testing.T) {
}
}

dr := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, testStats, clock.NewFake(), 3, blog.UseMock())
dr := NewTestDNSClientImpl(time.Second*10, []string{dnsLoopbackAddr}, metrics.NoopRegisterer, clock.NewFake(), 3, blog.UseMock())
dr.dnsClient = &testExchanger{errs: []error{isTempErr, isTempErr, nil}}
ctx, cancel := context.WithCancel(context.Background())
cancel()
Expand Down Expand Up @@ -705,7 +699,7 @@ func TestRotateServerOnErr(t *testing.T) {
// number of dnsServers to ensure we always get around to trying the one
// working server
maxTries := 5
client := NewTestDNSClientImpl(time.Second*10, dnsServers, testStats, clock.NewFake(), maxTries, blog.UseMock())
client := NewTestDNSClientImpl(time.Second*10, dnsServers, metrics.NoopRegisterer, clock.NewFake(), maxTries, blog.UseMock())

// Configure a mock exchanger that will always return a retryable error for
// the A and B servers. This will force the C server to do all the work once
Expand Down
22 changes: 11 additions & 11 deletions ca/ca.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ import (
"github.com/letsencrypt/boulder/features"
"github.com/letsencrypt/boulder/goodkey"
blog "github.com/letsencrypt/boulder/log"
"github.com/letsencrypt/boulder/metrics"
sapb "github.com/letsencrypt/boulder/sa/proto"
)

Expand Down Expand Up @@ -90,10 +89,6 @@ var (

// Metrics for CA statistics
const (
// Increments when CA observes an HSM or signing error
metricSigningError = "SigningError"
metricHSMError = metricSigningError + ".HSMError"

csrExtensionCategory = "category"
csrExtensionBasic = "basic"
csrExtensionTLSFeature = "tls-feature"
Expand Down Expand Up @@ -131,7 +126,6 @@ type CertificateAuthorityImpl struct {
keyPolicy goodkey.KeyPolicy
clk clock.Clock
log blog.Logger
stats metrics.Scope
prefix int // Prepended to the serial number
validityPeriod time.Duration
backdate time.Duration
Expand All @@ -141,6 +135,7 @@ type CertificateAuthorityImpl struct {
csrExtensionCount *prometheus.CounterVec
orphanCount *prometheus.CounterVec
adoptedOrphanCount *prometheus.CounterVec
signErrorCounter *prometheus.CounterVec
orphanQueue *goque.Queue
ocspLifetime time.Duration
}
Expand Down Expand Up @@ -213,7 +208,7 @@ func NewCertificateAuthorityImpl(
sa certificateStorage,
pa core.PolicyAuthority,
clk clock.Clock,
stats metrics.Scope,
stats prometheus.Registerer,
issuers []Issuer,
keyPolicy goodkey.KeyPolicy,
logger blog.Logger,
Expand Down Expand Up @@ -266,7 +261,7 @@ func NewCertificateAuthorityImpl(

csrExtensionCount := prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "csrExtensions",
Name: "csr_extensions",
Help: "Number of CSRs with extensions of the given category",
},
[]string{csrExtensionCategory})
Expand Down Expand Up @@ -296,6 +291,11 @@ func NewCertificateAuthorityImpl(
[]string{"type"})
stats.MustRegister(adoptedOrphanCount)

signErrorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "signature_errors",
Help: "A counter of signature errors labelled by error type",
}, []string{"type"})

ca = &CertificateAuthorityImpl{
sa: sa,
pa: pa,
Expand All @@ -306,7 +306,6 @@ func NewCertificateAuthorityImpl(
prefix: config.SerialPrefix,
clk: clk,
log: logger,
stats: stats,
keyPolicy: keyPolicy,
forceCNFromSAN: !config.DoNotForceCN, // Note the inversion here
signatureCount: signatureCount,
Expand All @@ -315,6 +314,7 @@ func NewCertificateAuthorityImpl(
adoptedOrphanCount: adoptedOrphanCount,
orphanQueue: orphanQueue,
ocspLifetime: config.LifespanOCSP.Duration,
signErrorCounter: signErrorCounter,
}

ca.idToIssuer = make(map[int64]*internalIssuer)
Expand Down Expand Up @@ -349,9 +349,9 @@ func NewCertificateAuthorityImpl(
func (ca *CertificateAuthorityImpl) noteSignError(err error) {
if err != nil {
if _, ok := err.(*pkcs11.Error); ok {
ca.stats.Inc(metricHSMError, 1)
ca.signErrorCounter.WithLabelValues("HSM").Inc()
} else if cfErr, ok := err.(*cferr.Error); ok {
ca.stats.Inc(fmt.Sprintf("%s.%d", metricSigningError, cfErr.ErrorCode), 1)
ca.signErrorCounter.WithLabelValues(fmt.Sprintf("CFSSL %d", cfErr.ErrorCode)).Inc()
}
}
return
Expand Down
Loading

0 comments on commit 5b2f11e

Please sign in to comment.