Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(cosmovisor): Add a restart delay after halt and before backup #12188

Merged
merged 10 commits into from
Jun 9, 2022
1 change: 1 addition & 0 deletions cosmovisor/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Ref: https://keepachangelog.com/en/1.0.0/

### Features

* [\#12188](https://github.com/cosmos/cosmos-sdk/pull/12188) Add a `DAEMON_RESTART_DELAY` environment variable that lets a node operator define a delay between the node halt (for upgrade) and the backup.
* [\#11823](https://github.com/cosmos/cosmos-sdk/pull/11823) Refactor `cosmovisor` CLI to use `cobra`.
* [\#11731](https://github.com/cosmos/cosmos-sdk/pull/11731) `cosmovisor version -o json` returns the cosmovisor version and the result of `simd --output json --long` in one JSON object.

Expand Down
3 changes: 2 additions & 1 deletion cosmovisor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ All arguments passed to `cosmovisor run` will be passed to the application binar
* `DAEMON_NAME` is the name of the binary itself (e.g. `gaiad`, `regend`, `simd`, etc.).
* `DAEMON_ALLOW_DOWNLOAD_BINARIES` (*optional*), if set to `true`, will enable auto-downloading of new binaries (for security reasons, this is intended for full nodes rather than validators). By default, `cosmovisor` will not auto-download new binaries.
* `DAEMON_RESTART_AFTER_UPGRADE` (*optional*, default = `true`), if `true`, restarts the subprocess with the same command-line arguments and flags (but with the new binary) after a successful upgrade. Otherwise (`false`), `cosmovisor` stops running after an upgrade and requires the system administrator to manually restart it. Note restart is only after the upgrade and does not auto-restart the subprocess after an error occurs.
* `DAEMON_POLL_INTERVAL` is the interval length for polling the upgrade plan file. The value can either be a number (in milliseconds) or a duration (e.g. `1s`). Default: 300 milliseconds.
* `DAEMON_RESTART_DELAY` (*optional*, default none) allows a node operator to define a delay between the node halt (for upgrade) and the backup. The value can either be a number (in milliseconds) or a duration (e.g. `1s`).
julienrbrt marked this conversation as resolved.
Show resolved Hide resolved
* `DAEMON_POLL_INTERVAL` (*optional*, default 300 milliseconds), is the interval length for polling the upgrade plan file. The value can either be a number (in milliseconds) or a duration (e.g. `1s`).
* `DAEMON_DATA_BACKUP_DIR` option to set a custom backup directory. If not set, `DAEMON_HOME` is used.
* `UNSAFE_SKIP_BACKUP` (defaults to `false`), if set to `true`, upgrades directly without performing a backup. Otherwise (`false`, default) backs up the data before trying the upgrade. The default value of `false` is useful and recommended in case of failures and when a backup is needed to roll back. We recommend using the default backup option `UNSAFE_SKIP_BACKUP=false`.
* `DAEMON_PREUPGRADE_MAX_RETRIES` (defaults to `0`). The maximum number of times to call `pre-upgrade` in the application after exit status of `31`. After the maximum number of retries, cosmovisor fails the upgrade.
Expand Down
72 changes: 56 additions & 16 deletions cosmovisor/args.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package cosmovisor

import (
"encoding/json"
"errors"
"fmt"
"net/url"
"os"
Expand All @@ -23,6 +22,7 @@ const (
EnvName = "DAEMON_NAME"
EnvDownloadBin = "DAEMON_ALLOW_DOWNLOAD_BINARIES"
EnvRestartUpgrade = "DAEMON_RESTART_AFTER_UPGRADE"
EnvRestartDelay = "DAEMON_RESTART_DELAY"
EnvSkipBackup = "UNSAFE_SKIP_BACKUP"
EnvDataBackupPath = "DAEMON_DATA_BACKUP_DIR"
EnvInterval = "DAEMON_POLL_INTERVAL"
Expand All @@ -45,6 +45,7 @@ type Config struct {
Name string
AllowDownloadBinaries bool
RestartAfterUpgrade bool
RestartDelay time.Duration
PollInterval time.Duration
UnsafeSkipBackup bool
DataBackupPath string
Expand Down Expand Up @@ -97,6 +98,13 @@ func (cfg *Config) SymLinkToGenesis() (string, error) {
return cfg.GenesisBin(), nil
}

// WaitRestartDelay waits for the configured restart delay, if one is set.
julienrbrt marked this conversation as resolved.
Show resolved Hide resolved
func (cfg *Config) WaitRestartDelay() {
	// Nothing to wait for when no positive delay is configured.
	if cfg.RestartDelay <= 0 {
		return
	}

	// Block the caller for the configured delay.
	time.Sleep(cfg.RestartDelay)
}

// CurrentBin is the path to the currently selected binary (genesis if no link is set)
// This will resolve the symlink to the underlying directory to make it easier to debug
func (cfg *Config) CurrentBin() (string, error) {
Expand Down Expand Up @@ -152,23 +160,28 @@ func GetConfigFromEnv() (*Config, error) {

interval := os.Getenv(EnvInterval)
if interval != "" {
var intervalUInt uint64
intervalUInt, err = strconv.ParseUint(interval, 10, 32)
if err == nil {
cfg.PollInterval = time.Millisecond * time.Duration(intervalUInt)
val, err := parseEnvDuration(interval)
if err != nil {
errs = append(errs, fmt.Errorf("invalid: %s: %w", EnvInterval, err))
} else {
cfg.PollInterval, err = time.ParseDuration(interval)
}
switch {
case err != nil:
errs = append(errs, fmt.Errorf("invalid %s: could not parse \"%s\" into either a duration or uint (milliseconds)", EnvInterval, interval))
case cfg.PollInterval <= 0:
errs = append(errs, fmt.Errorf("invalid %s: must be greater than 0", EnvInterval))
cfg.PollInterval = val
}
} else {
cfg.PollInterval = 300 * time.Millisecond
}

restartDelay := os.Getenv(EnvRestartDelay)
if restartDelay != "" {
val, err := parseEnvDuration(restartDelay)
if err != nil {
errs = append(errs, fmt.Errorf("invalid: %s: %w", EnvRestartDelay, err))
} else {
cfg.RestartDelay = val
}
} else {
cfg.RestartDelay = 0 // default value but still setting for clarity
}
julienrbrt marked this conversation as resolved.
Show resolved Hide resolved

envPreupgradeMaxRetriesVal := os.Getenv(EnvPreupgradeMaxRetries)
if cfg.PreupgradeMaxRetries, err = strconv.Atoi(envPreupgradeMaxRetriesVal); err != nil && envPreupgradeMaxRetriesVal != "" {
errs = append(errs, fmt.Errorf("%s could not be parsed to int: %w", EnvPreupgradeMaxRetries, err))
Expand All @@ -182,6 +195,27 @@ func GetConfigFromEnv() (*Config, error) {
return cfg, nil
}

// parseEnvDuration converts the textual value of an environment variable
// into a time.Duration.
//
// The input is first tried as a base-10 unsigned 32-bit integer, which is
// interpreted as a number of milliseconds. If that fails, the input is handed
// to time.ParseDuration (e.g. "1s", "1m30s").
//
// It returns an error when the input matches neither form, or when the
// resulting duration is zero or negative. On error the returned duration is 0.
func parseEnvDuration(input string) (time.Duration, error) {
	var parsed time.Duration

	if millis, numErr := strconv.ParseUint(input, 10, 32); numErr == nil {
		// Plain number: treat it as milliseconds.
		parsed = time.Duration(millis) * time.Millisecond
	} else {
		// Not a plain number: fall back to Go duration syntax.
		d, durErr := time.ParseDuration(input)
		if durErr != nil {
			return 0, fmt.Errorf("could not parse \"%s\" into either a duration or uint (milliseconds)", input)
		}
		parsed = d
	}

	if parsed <= 0 {
		return 0, fmt.Errorf("must be greater than 0")
	}

	return parsed, nil
}

// LogConfigOrError logs either the config details or the error.
func LogConfigOrError(logger *zerolog.Logger, cfg *Config, err error) {
if cfg == nil && err == nil {
Expand All @@ -201,15 +235,18 @@ func LogConfigOrError(logger *zerolog.Logger, cfg *Config, err error) {
// and that Name is set
func (cfg *Config) validate() []error {
var errs []error

// validate EnvName
if cfg.Name == "" {
errs = append(errs, errors.New(EnvName+" is not set"))
errs = append(errs, fmt.Errorf("%s is not set", EnvName))
}

// validate EnvHome
switch {
case cfg.Home == "":
errs = append(errs, errors.New(EnvHome+" is not set"))
errs = append(errs, fmt.Errorf("%s is not set", EnvHome))
case !filepath.IsAbs(cfg.Home):
errs = append(errs, errors.New(EnvHome+" must be an absolute path"))
errs = append(errs, fmt.Errorf("%s must be an absolute path", EnvHome))
default:
switch info, err := os.Stat(cfg.Root()); {
case err != nil:
Expand All @@ -223,7 +260,8 @@ func (cfg *Config) validate() []error {
if cfg.UnsafeSkipBackup == true {
return errs
}
// if UnsafeSkipBackup is false, check if the DataBackupPath valid

// if UnsafeSkipBackup is false, validate DataBackupPath
switch {
case cfg.DataBackupPath == "":
errs = append(errs, fmt.Errorf("%s must not be empty", EnvDataBackupPath))
Expand Down Expand Up @@ -327,11 +365,13 @@ func (cfg Config) DetailString() string {
{EnvName, cfg.Name},
{EnvDownloadBin, fmt.Sprintf("%t", cfg.AllowDownloadBinaries)},
{EnvRestartUpgrade, fmt.Sprintf("%t", cfg.RestartAfterUpgrade)},
{EnvRestartDelay, fmt.Sprintf("%s", cfg.RestartDelay)},
{EnvInterval, fmt.Sprintf("%s", cfg.PollInterval)},
{EnvSkipBackup, fmt.Sprintf("%t", cfg.UnsafeSkipBackup)},
{EnvDataBackupPath, cfg.DataBackupPath},
{EnvPreupgradeMaxRetries, fmt.Sprintf("%d", cfg.PreupgradeMaxRetries)},
}

derivedEntries := []struct{ name, value string }{
{"Root Dir", cfg.Root()},
{"Upgrade Dir", cfg.BaseUpgradeDir()},
Expand Down
Loading