Skip to content

Add a lifetime manager for Vault authentication tokens #7337

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
* [FEATURE] Alertmanager API: added `-alertmanager.grafana-alertmanager-compatibility-enabled` CLI flag (and respective YAML config option) to enable an experimental API endpoints that support the migration of the Grafana Alertmanager. #7057
* [FEATURE] Alertmanager: Added `-alertmanager.utf8-strict-mode-enabled` to control support for any UTF-8 character as part of Alertmanager configuration/API matchers and labels. It's default value is set to `false`. #6898
* [FEATURE] Querier: added `histogram_avg()` function support to PromQL. #7293
* [ENHANCEMENT] Vault: add lifecycle manager for token used to authenticate to Vault. This ensures the client token is always valid. Includes a gauge (`cortex_vault_token_lease_renewal_active`) to check whether token renewal is active, and the counters `cortex_vault_token_lease_renewal_success_total` and `cortex_vault_auth_success_total` to see the total number of successful lease renewals / authentications. #7337
* [ENHANCEMENT] Store-gateway: add no-compact details column on store-gateway tenants admin UI. #6848
* [ENHANCEMENT] PromQL: ignore small errors for bucketQuantile #6766
* [ENHANCEMENT] Distributor: improve efficiency of some errors #6785
Expand Down
7 changes: 6 additions & 1 deletion docs/sources/mimir/configure/about-versioning.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,12 @@ The following features are currently experimental:
- Metric separation by an additionally configured group label
- `-validation.separate-metrics-group-label`
- `-max-separate-metrics-groups-per-user`
- Fetching TLS secrets from Vault for various clients (`-vault.enabled`)
- Vault
- Fetching TLS secrets from Vault for various clients (`-vault.enabled`)
- Vault client authentication token lifetime watcher. Ensures the client token is always valid by renewing the token lease or re-authenticating. Includes the metrics:
- `cortex_vault_token_lease_renewal_active`
- `cortex_vault_token_lease_renewal_success_total`
- `cortex_vault_auth_success_total`
- Logger
- Rate limited logger support
- `log.rate-limit-enabled`
Expand Down
7 changes: 7 additions & 0 deletions integration/images.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// SPDX-License-Identifier: AGPL-3.0-only

package integration

var (
VaultImage = "hashicorp/vault:1.13.2"
)
82 changes: 82 additions & 0 deletions integration/vault_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// SPDX-License-Identifier: AGPL-3.0-only
//go:build requires_docker

package integration

import (
"fmt"
"testing"

"github.com/grafana/e2e"
e2edb "github.com/grafana/e2e/db"
hashivault "github.com/hashicorp/vault/api"
"github.com/stretchr/testify/require"

"github.com/grafana/mimir/integration/e2emimir"
)

func TestVaultTokenRenewal(t *testing.T) {
const devToken = "dev_token"
const httpPort = 8200

s, err := e2e.NewScenario(networkName)
require.NoError(t, err)
defer s.Close()

// Initialize Vault
vault := e2e.NewHTTPService(
"vault",
VaultImage,
nil,
e2e.NewHTTPReadinessProbe(httpPort, "/v1/sys/health", 200, 200),
httpPort,
)
vault.SetEnvVars(map[string]string{"VAULT_DEV_ROOT_TOKEN_ID": devToken})
require.NoError(t, s.StartAndWaitReady(vault))

cli, err := hashivault.NewClient(&hashivault.Config{Address: fmt.Sprintf("http://%s", vault.HTTPEndpoint())})
require.NoError(t, err)

cli.SetToken(devToken)

err = cli.Sys().EnableAuthWithOptions("userpass", &hashivault.EnableAuthOptions{
Type: "userpass",
})
require.NoError(t, err)

_, err = cli.Logical().Write("auth/userpass/users/foo", map[string]interface{}{
"password": "bar",
"ttl": "5s",
"max_ttl": "10s",
})
require.NoError(t, err)

consul := e2edb.NewConsul()
minio := e2edb.NewMinio(9000, blocksBucketName)
require.NoError(t, s.StartAndWaitReady(consul, minio))

flags := mergeFlags(
BlocksStorageFlags(),
BlocksStorageS3Flags(),
map[string]string{
"-vault.enabled": "true",
"-vault.url": fmt.Sprintf("http://%s", vault.NetworkHTTPEndpoint()),
"-vault.mount-path": "secret",
"-vault.auth.type": "userpass",
"-vault.auth.userpass.username": "foo",
"-vault.auth.userpass.password": "bar",
"-log.level": "debug",
},
)

// Start Mimir
mimir := e2emimir.NewSingleBinary("mimir-1", e2e.MergeFlags(DefaultSingleBinaryFlags(), flags))
require.NoError(t, s.StartAndWaitReady(mimir))

// Check that token renewal is active
require.NoError(t, mimir.WaitSumMetrics(e2e.Equals(1), "cortex_vault_token_lease_renewal_active"))
// Check that the token lease has been updated before hitting max_ttl
require.NoError(t, mimir.WaitSumMetrics(e2e.GreaterOrEqual(2), "cortex_vault_token_lease_renewal_success_total"))
// Check that re-authentication occurred
require.NoError(t, mimir.WaitSumMetrics(e2e.Equals(2), "cortex_vault_auth_success_total"))
}
11 changes: 9 additions & 2 deletions pkg/mimir/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ func (t *Mimir) initVault() (services.Service, error) {
return nil, nil
}

v, err := vault.NewVault(t.Cfg.Vault)
v, err := vault.NewVault(t.Cfg.Vault, util_log.Logger, t.Registerer)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -264,7 +264,14 @@ func (t *Mimir) initVault() (services.Service, error) {
}
}

return nil, nil
runFunc := func(ctx context.Context) error {
err := t.Vault.KeepRenewingTokenLease(ctx)
// We don't want to turn Mimir into an unready state if Vault fails here
<-ctx.Done()
return err
}

return services.NewBasicService(nil, runFunc, nil), nil
}

func (t *Mimir) initSanityCheck() (services.Service, error) {
Expand Down
116 changes: 102 additions & 14 deletions pkg/vault/vault.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@ import (
"errors"
"flag"
"fmt"
"time"

"github.com/go-kit/log"
"github.com/go-kit/log/level"
hashivault "github.com/hashicorp/vault/api"
"github.com/hashicorp/vault/api/auth/approle"
"github.com/hashicorp/vault/api/auth/kubernetes"
"github.com/hashicorp/vault/api/auth/userpass"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)

// Config for the Vault used to fetch secrets
Expand Down Expand Up @@ -57,13 +62,22 @@ type SecretsEngine interface {
}

type Vault struct {
KVStore SecretsEngine
kvStore SecretsEngine
auth AuthConfig

client *hashivault.Client
token *hashivault.Secret
logger log.Logger

authLeaseRenewalActive prometheus.Gauge
authLeaseRenewalSuccessTotal prometheus.Counter
authSuccessTotal prometheus.Counter
}

func NewVault(cfg Config) (*Vault, error) {
func NewVault(cfg Config, l log.Logger, registerer prometheus.Registerer) (*Vault, error) {
if cfg.Mock != nil {
return &Vault{
KVStore: cfg.Mock,
kvStore: cfg.Mock,
}, nil
}

Expand All @@ -75,26 +89,37 @@ func NewVault(cfg Config) (*Vault, error) {
return nil, err
}

authMethod, err := cfg.Auth.authMethod()
authToken, err := getAuthToken(context.Background(), &cfg.Auth, client)
if err != nil {
return nil, err
}

authFac := authFactoryReal{}
_, err = authMethod.authenticate(context.Background(), &authFac, client)
if err != nil {
return nil, fmt.Errorf("error authenticating to vault: %w", err)
return nil, fmt.Errorf("failed to get auth token from vault: %v", err)
}

vault := &Vault{
KVStore: client.KVv2(cfg.MountPath),
kvStore: client.KVv2(cfg.MountPath),
auth: cfg.Auth,
token: authToken,
client: client,
logger: l,
authLeaseRenewalActive: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
Name: "cortex_vault_token_lease_renewal_active",
Help: "Gauge to check whether token renewal is active or not (0 active, 1 inactive).",
}),
authLeaseRenewalSuccessTotal: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_vault_token_lease_renewal_success_total",
Help: "Number of successful token lease renewals. Token is renewed as it approaches the end of its ttl.",
}),
authSuccessTotal: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Name: "cortex_vault_auth_success_total",
Help: "Number of successful authentications. Authentication occurs when the token has reached its max_ttl and the lease can no longer be renewed.",
}),
}
vault.authSuccessTotal.Inc()

return vault, nil
}

func (v *Vault) ReadSecret(path string) ([]byte, error) {
secret, err := v.KVStore.Get(context.Background(), path)
secret, err := v.kvStore.Get(context.Background(), path)

if err != nil {
return nil, fmt.Errorf("unable to read secret from vault: %v", err)
Expand All @@ -106,14 +131,77 @@ func (v *Vault) ReadSecret(path string) ([]byte, error) {

data, ok := secret.Data["value"].(string)
if !ok {
return nil, fmt.Errorf("secret data type is not string, found %T value: %#v", secret.Data["value"], secret.Data["value"])
return nil, fmt.Errorf("secret data type is not string, found %T value: %#v at path: %s", secret.Data["value"], secret.Data["value"], path)
}

return []byte(data), nil
}

func (v *Vault) manageTokenLifecycle(ctx context.Context, authTokenWatcher *hashivault.LifetimeWatcher) {
go authTokenWatcher.Start()
defer authTokenWatcher.Stop()

for {
select {
case <-ctx.Done():
return

case <-authTokenWatcher.DoneCh():
// Token failed to renew (e.g expired), re-auth required
return

case renewalInfo := <-authTokenWatcher.RenewCh():
// Token was successfully renewed
if renewalInfo.Secret.Auth != nil {
level.Debug(v.logger).Log("msg", "token renewed", "leaseDuration", time.Duration(renewalInfo.Secret.Auth.LeaseDuration)*time.Second)
v.authLeaseRenewalSuccessTotal.Inc()
}
}
}
}

func (v *Vault) KeepRenewingTokenLease(ctx context.Context) error {
v.authLeaseRenewalActive.Inc()
defer v.authLeaseRenewalActive.Dec()

for ctx.Err() == nil {
authTokenWatcher, err := v.client.NewLifetimeWatcher(&hashivault.LifetimeWatcherInput{
Secret: v.token,
})
if err != nil {
return fmt.Errorf("error initializing auth token lifetime watcher: %v", err)
}

v.manageTokenLifecycle(ctx, authTokenWatcher)

if ctx.Err() != nil {
return ctx.Err()
}

newAuthToken, err := getAuthToken(ctx, &v.auth, v.client)
if err != nil {
level.Error(v.logger).Log("msg", "error during re-authentication after token expiry", "err", err)
return err
}

v.authSuccessTotal.Inc()
v.token = newAuthToken
}

return nil
}

type authFactoryReal struct{}

func getAuthToken(ctx context.Context, authCfg *AuthConfig, client *hashivault.Client) (*hashivault.Secret, error) {
am, err := authCfg.authMethod()
if err != nil {
return nil, err
}

return am.authenticate(ctx, &authFactoryReal{}, client)
}

func (af *authFactoryReal) NewAppRoleAuth(roleID string, secretID *approle.SecretID, opts ...approle.LoginOption) (*approle.AppRoleAuth, error) {
return approle.NewAppRoleAuth(
roleID,
Expand Down
2 changes: 1 addition & 1 deletion pkg/vault/vault_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
func TestReadSecret(t *testing.T) {
mockKVStore := newMockKVStore()
mimirVaultClient := Vault{
KVStore: mockKVStore,
kvStore: mockKVStore,
}

tests := map[string]struct {
Expand Down
3 changes: 3 additions & 0 deletions tools/pre-pull-images/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ func main() {
fmt.Println(images.Memcached)
fmt.Println(images.Kafka)

// vault image
fmt.Println(integration.VaultImage)

// images from previous releases
for image := range integration.DefaultPreviousVersionImages {
fmt.Println(image)
Expand Down