Skip to content

Commit

Permalink
Retry upgrade downloads (#2776)
Browse files Browse the repository at this point in the history
* Initial implementation

* Adding FIXME

* Fixing FIXME comment location

* Updating some timeouts for testing

* Removing extra wait

* Adding CHANGELOG fragment

* Remove commented out const

* Use backoff library

* Add test case (comment)

* Changing version to make testing with Fleet easier

* Add unit test

* Check log entry in unit test

* Fleshing out unit tests

* Use passed-in context to create new context

* Using consistent naming for setting

* Updating integration test fixtures

* Update internal/pkg/agent/application/upgrade/artifact/config.go

Co-authored-by: Michal Pristas <michal.pristas@gmail.com>

* Update internal/pkg/agent/application/upgrade/artifact/config.go

Co-authored-by: Michal Pristas <michal.pristas@gmail.com>

* Remove testing-only changes

* Running mage fmt

* Update internal/pkg/agent/application/upgrade/step_download_test.go

Co-authored-by: Tiago Queiroz <contato@tiago.eti.br>

* Updating default configuration files

* Running mage fmt

* Updating template config files

* Removing WIP file

* Adding unit test for timeout expiring

* Clarifying comment

* Updating unit test

* Removing retry_max_count setting

---------

Co-authored-by: Michal Pristas <michal.pristas@gmail.com>
Co-authored-by: Tiago Queiroz <contato@tiago.eti.br>
  • Loading branch information
3 people authored Jun 5, 2023
1 parent da02074 commit 695bd42
Show file tree
Hide file tree
Showing 14 changed files with 288 additions and 42 deletions.
60 changes: 30 additions & 30 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,36 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
Dependency : github.com/cenkalti/backoff/v4
Version: v4.1.2
Licence type (autodetected): MIT
--------------------------------------------------------------------------------

Contents of probable licence file $GOMODCACHE/github.com/cenkalti/backoff/v4@v4.1.2/LICENSE:

The MIT License (MIT)

Copyright (c) 2014 Cenk Altı

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
Dependency : github.com/coreos/go-systemd/v22
Version: v22.3.3-0.20220203105225-a9a7ef127534
Expand Down Expand Up @@ -8086,36 +8116,6 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
Dependency : github.com/cenkalti/backoff/v4
Version: v4.1.2
Licence type (autodetected): MIT
--------------------------------------------------------------------------------

Contents of probable licence file $GOMODCACHE/github.com/cenkalti/backoff/v4@v4.1.2/LICENSE:

The MIT License (MIT)

Copyright (c) 2014 Cenk Altı

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
Dependency : github.com/cyphar/filepath-securejoin
Version: v0.2.3
Expand Down
3 changes: 3 additions & 0 deletions _meta/config/common.p2.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ inputs:
# # install_path describes the location of installed packages/programs. It is also used
# # for reading program specifications.
# install_path: "${path.data}/install"
# # retry_sleep_init_duration is the duration to sleep for before the first retry attempt. This
# # duration will increase for subsequent retry attempts in a randomized exponential backoff manner.
# retry_sleep_init_duration: 30s

# agent.process:
# # timeout for creating new processes. when process is not successfully created by this timeout
Expand Down
3 changes: 3 additions & 0 deletions _meta/config/common.reference.p2.yml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ inputs:
# # install_path describes the location of installed packages/programs. It is also used
# # for reading program specifications.
# install_path: "${path.data}/install"
# # retry_sleep_init_duration is the duration to sleep for before the first retry attempt. This
# # duration will increase for subsequent retry attempts in a randomized exponential backoff manner.
# retry_sleep_init_duration: 30s

# agent.process:
# # timeout for creating new processes. when process is not successfully created by this timeout
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# Change summary; a 80ish characters long description of the change.
summary: Retry download step in upgrade process

# Long description; in case the summary is not enough to describe the change
# this field accommodate a description without length limits.
# NOTE: This field will be rendered only for breaking-change and known-issue kinds at the moment.
#description:

# Affected component; a word indicating the component this changeset affects.
component: agent

# PR URL; optional; the PR number that added the changeset.
# If not present is automatically filled by the tooling finding the PR where this changelog fragment has been added.
# NOTE: the tooling supports backports, so it's able to fill the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
pr: https://github.com/elastic/elastic-agent/pull/2776

# Issue URL; optional; the GitHub issue related to this changeset (either closes or is part of).
# If not present is automatically filled by the tooling with the issue linked to the PR number.
#issue: https://github.com/owner/repo/1234
3 changes: 3 additions & 0 deletions elastic-agent.reference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ inputs:
# # install_path describes the location of installed packages/programs. It is also used
# # for reading program specifications.
# install_path: "${path.data}/install"
# # retry_sleep_init_duration is the duration to sleep for before the first retry attempt. This
# # duration will increase for subsequent retry attempts in a randomized exponential backoff manner.
# retry_sleep_init_duration: 30s

# agent.process:
# # timeout for creating new processes. when process is not successfully created by this timeout
Expand Down
3 changes: 3 additions & 0 deletions elastic-agent.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ inputs:
# # install_path describes the location of installed packages/programs. It is also used
# # for reading program specifications.
# install_path: "${path.data}/install"
# # retry_sleep_init_duration is the duration to sleep for before the first retry attempt. This
# # duration will increase for subsequent retry attempts in a randomized exponential backoff manner.
# retry_sleep_init_duration: 30s

# agent.process:
# # timeout for creating new processes. when process is not successfully created by this timeout
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ require (
github.com/billgraziano/dpapi v0.4.0
github.com/blakesmith/ar v0.0.0-20150311145944-8bd4349a67f2
github.com/cavaliercoder/go-rpm v0.0.0-20190131055624-7a9c54e3d83e
github.com/cenkalti/backoff/v4 v4.1.2
github.com/coreos/go-systemd/v22 v22.3.3-0.20220203105225-a9a7ef127534
github.com/docker/go-units v0.5.0
github.com/dolmen-go/contextio v0.0.0-20200217195037-68fc5150bcd5
Expand Down Expand Up @@ -73,7 +74,6 @@ require (
github.com/akavel/rsrc v0.8.0 // indirect
github.com/armon/go-radix v1.0.0 // indirect
github.com/cavaliercoder/badio v0.0.0-20160213150051-ce5280129e9e // indirect
github.com/cenkalti/backoff/v4 v4.1.2 // indirect
github.com/cyphar/filepath-securejoin v0.2.3 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dnephin/pflag v1.0.7 // indirect
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ output_permissions:
agent:
download:
sourceURI: 'https://artifacts.elastic.co/downloads/'
retry_sleep_init_duration: 30s
monitoring:
enabled: true
use_output: default
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
agent:
download:
sourceURI: https://artifacts.elastic.co/downloads/
retry_sleep_init_duration: 30s
features:
fqdn:
enabled: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ agent:
installPath: install
dropPath: ""
timeout: 2h0m0s
retry_sleep_init_duration: 30s
process:
spawn_timeout: 30s
stop_timeout: 30s
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
agent:
download:
sourceURI: https://artifacts.elastic.co/downloads/
retry_sleep_init_duration: 30s
features:
fqdn:
enabled: true
Expand Down
13 changes: 9 additions & 4 deletions internal/pkg/agent/application/upgrade/artifact/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ type Config struct {
// If not provided FileSystem Downloader will fallback to /beats subfolder of elastic-agent directory.
DropPath string `yaml:"dropPath" config:"drop_path"`

// RetrySleepInitDuration: the duration to sleep for before the first retry attempt. This duration
// will increase for subsequent retry attempts in a randomized exponential backoff manner.
RetrySleepInitDuration time.Duration `yaml:"retry_sleep_init_duration" config:"retry_sleep_init_duration"`

httpcommon.HTTPTransportSettings `config:",inline" yaml:",inline"` // Note: use anonymous struct for json inline
}

Expand Down Expand Up @@ -157,10 +161,11 @@ func DefaultConfig() *Config {
transport.Timeout = 120 * time.Minute

return &Config{
SourceURI: DefaultSourceURI,
TargetDirectory: paths.Downloads(),
InstallPath: paths.Install(),
HTTPTransportSettings: transport,
SourceURI: DefaultSourceURI,
TargetDirectory: paths.Downloads(),
InstallPath: paths.Install(),
RetrySleepInitDuration: 30 * time.Second,
HTTPTransportSettings: transport,
}
}

Expand Down
57 changes: 50 additions & 7 deletions internal/pkg/agent/application/upgrade/step_download.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ import (
"fmt"
"os"
"strings"
"time"

"github.com/cenkalti/backoff/v4"

"go.elastic.co/apm"

Expand Down Expand Up @@ -47,18 +50,13 @@ func (u *Upgrader) downloadArtifact(ctx context.Context, version, sourceURI stri
"source_uri", settings.SourceURI, "drop_path", settings.DropPath,
"target_path", settings.TargetDirectory, "install_path", settings.InstallPath)

fetcher, err := newDownloader(version, u.log, &settings)
if err != nil {
return "", errors.New(err, "initiating fetcher")
}

if err := os.MkdirAll(paths.Downloads(), 0750); err != nil {
return "", errors.New(err, fmt.Sprintf("failed to create download directory at %s", paths.Downloads()))
}

path, err := fetcher.Download(ctx, agentArtifact, version)
path, err := u.downloadWithRetries(ctx, newDownloader, version, &settings)
if err != nil {
return "", errors.New(err, "failed upgrade of agent binary")
return "", err
}

if skipVerifyOverride {
Expand Down Expand Up @@ -119,3 +117,48 @@ func newVerifier(version string, log *logger.Logger, settings *artifact.Config)

return composed.NewVerifier(fsVerifier, snapshotVerifier, remoteVerifier), nil
}

func (u *Upgrader) downloadWithRetries(
ctx context.Context,
downloaderCtor func(string, *logger.Logger, *artifact.Config) (download.Downloader, error),
version string,
settings *artifact.Config,
) (string, error) {
cancelCtx, cancel := context.WithTimeout(ctx, settings.Timeout)
defer cancel()

expBo := backoff.NewExponentialBackOff()
expBo.InitialInterval = settings.RetrySleepInitDuration
boCtx := backoff.WithContext(expBo, cancelCtx)

var path string
var attempt uint

opFn := func() error {
attempt++
u.log.Debugf("download attempt %d", attempt)

downloader, err := downloaderCtor(version, u.log, settings)
if err != nil {
return fmt.Errorf("unable to create fetcher: %w", err)
}

path, err = downloader.Download(cancelCtx, agentArtifact, version)
if err != nil {
return fmt.Errorf("unable to download package: %w", err)
}

// Download successful
return nil
}

opFailureNotificationFn := func(err error, retryAfter time.Duration) {
u.log.Warnf("%s; retrying (will be retry %d) in %s.", err.Error(), attempt, retryAfter)
}

if err := backoff.RetryNotify(opFn, boCtx, opFailureNotificationFn); err != nil {
return "", err
}

return path, nil
}
Loading

0 comments on commit 695bd42

Please sign in to comment.