Commit

Merge pull request #1415 from bnb-chain/fix-metric-sp-qa
fix: check sp health retry
ruojunm authored Jun 4, 2024
2 parents 3a6570c + 22aa663 commit f420d9a
Showing 2 changed files with 51 additions and 20 deletions.
55 changes: 35 additions & 20 deletions base/gfspvgmgr/virtual_group_manager.go
@@ -7,6 +7,7 @@ import (
 	"math/rand"
 	"net/http"
 	"sort"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -19,6 +20,7 @@ import (
 	"github.com/bnb-chain/greenfield-storage-provider/core/consensus"
 	"github.com/bnb-chain/greenfield-storage-provider/core/vgmgr"
 	"github.com/bnb-chain/greenfield-storage-provider/pkg/log"
+	"github.com/bnb-chain/greenfield-storage-provider/pkg/metrics"
 	"github.com/bnb-chain/greenfield-storage-provider/util"
 	sptypes "github.com/bnb-chain/greenfield/x/sp/types"
 	virtualgrouptypes "github.com/bnb-chain/greenfield/x/virtualgroup/types"
@@ -33,9 +35,11 @@ const (
 	DefaultInitialGVGStakingStorageSize = uint64(2) * 1024 * 1024 * 1024 * 1024 // 2TB per GVG, chain side DefaultMaxStoreSizePerFamily is 64 TB
 	additionalGVGStakingStorageSize     = uint64(1) * 1024 * 1024 * 1024 * 1024 // 1TB

-	defaultSPCheckTimeout          = 3 * time.Second
-	defaultSPHealthCheckerInterval = 10 * time.Second
-	httpStatusPath                 = "/status"
+	defaultSPCheckTimeout               = 1 * time.Minute
+	defaultSPHealthCheckerInterval      = 10 * time.Second
+	defaultSPHealthCheckerRetryInterval = 1 * time.Second
+	defaultSPHealthCheckerMaxRetries    = 5
+	httpStatusPath                      = "/status"

 	emptyGVGSafeDeletePeriod = int64(60) * 60 * 24
 )
@@ -773,7 +777,7 @@ func (checker *HealthChecker) checkAllSPHealth() {

 func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
 	if !sp.IsInService() {
-		log.CtxInfow(context.Background(), "the sp is not in service,sp is treated as unhealthy", "sp", sp)
+		log.CtxInfow(context.Background(), "the sp is not in service, sp is treated as unhealthy", "sp", sp)
 		return false
 	}

@@ -785,30 +789,41 @@ func (checker *HealthChecker) checkSPHealth(sp *sptypes.StorageProvider) bool {
 		Transport: &http.Transport{
 			TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
 		},
-		Timeout: defaultSPCheckTimeout * time.Second,
+		Timeout: defaultSPCheckTimeout,
 	}

 	// Create an HTTP request to test the validity of the endpoint
 	urlToCheck := fmt.Sprintf("%s%s", endpoint, httpStatusPath)
-	req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
-	if err != nil {
-		return false
-	}
+	for attempt := 0; attempt < defaultSPHealthCheckerMaxRetries; attempt++ {
+		start := time.Now()
+		req, err := http.NewRequestWithContext(ctxTimeout, http.MethodGet, urlToCheck, nil)
+		if err != nil {
+			log.CtxErrorw(context.Background(), "failed to create request", "sp", sp, "error", err)
+			return false
+		}

-	resp, err := client.Do(req)
-	if err != nil {
-		log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err)
-		return false
-	}
-	defer resp.Body.Close()
+		resp, err := client.Do(req)
+		duration := time.Since(start)
+		metrics.SPHealthCheckerTime.WithLabelValues(strconv.Itoa(int(sp.Id))).Observe(duration.Seconds())
+		if err != nil {
+			log.CtxErrorw(context.Background(), "failed to connect to sp", "sp", sp, "error", err, "duration", duration)
+			time.Sleep(defaultSPHealthCheckerRetryInterval)
+			continue
+		}
+		defer resp.Body.Close()

-	if resp.StatusCode != http.StatusOK {
-		log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "resp_body", resp.Body)
-		return false
+		if resp.StatusCode == http.StatusOK {
+			log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp, "duration", duration)
+			return true
+		} else {
+			metrics.SPHealthCheckerFailureCounter.WithLabelValues(strconv.Itoa(int(sp.Id))).Inc()
+			log.CtxErrorw(context.Background(), "failed to check sp healthy", "sp", sp, "http_status_code", resp.StatusCode, "duration", duration)
+			time.Sleep(defaultSPHealthCheckerRetryInterval)
+		}
 	}

-	log.CtxInfow(context.Background(), "succeed to check the sp healthy", "sp", sp)
-	return true
+	log.CtxErrorw(context.Background(), "failed to check sp healthy after retries", "sp", sp)
+	return false
 }

 func (checker *HealthChecker) Start() {
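The change above turns the single /status probe into a retry loop: up to defaultSPHealthCheckerMaxRetries attempts, a defaultSPHealthCheckerRetryInterval pause after each failure, a one-minute overall budget, and per-attempt duration and failure metrics labelled by SP id. Below is a minimal, self-contained Go sketch of that retry pattern with the Prometheus metrics and structured logging stripped out; the constants, the example URL, and the checkEndpointHealth helper are illustrative only and not part of the repository.

// Sketch of the retry pattern introduced in checkSPHealth: probe an
// endpoint's /status path up to maxRetries times, pausing between failed
// attempts and timing each one. All names here are placeholders.
package main

import (
	"context"
	"fmt"
	"net/http"
	"time"
)

const (
	checkTimeout  = 1 * time.Minute // overall budget shared by all attempts
	retryInterval = 1 * time.Second // pause after a failed attempt
	maxRetries    = 5
)

// checkEndpointHealth reports whether endpoint answers GET /status with 200 OK
// within maxRetries attempts.
func checkEndpointHealth(endpoint string) bool {
	ctx, cancel := context.WithTimeout(context.Background(), checkTimeout)
	defer cancel()

	client := &http.Client{Timeout: checkTimeout}
	url := fmt.Sprintf("%s%s", endpoint, "/status")

	for attempt := 0; attempt < maxRetries; attempt++ {
		start := time.Now()
		req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
		if err != nil {
			return false // malformed endpoint; retrying will not help
		}

		resp, err := client.Do(req)
		duration := time.Since(start)
		if err != nil {
			fmt.Printf("attempt %d failed after %v: %v\n", attempt+1, duration, err)
			time.Sleep(retryInterval)
			continue
		}
		resp.Body.Close()

		if resp.StatusCode == http.StatusOK {
			fmt.Printf("attempt %d succeeded in %v\n", attempt+1, duration)
			return true
		}
		fmt.Printf("attempt %d returned status %d\n", attempt+1, resp.StatusCode)
		time.Sleep(retryInterval)
	}
	return false
}

func main() {
	fmt.Println(checkEndpointHealth("https://sp.example.com"))
}

Note that the context created from checkTimeout caps the total time across all attempts, while the per-attempt sleep only spaces the retries out; transient failures are retried, but a bad endpoint fails fast.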
16 changes: 16 additions & 0 deletions pkg/metrics/metric_items.go
@@ -61,6 +61,8 @@ var MetricsItems = []prometheus.Collector{
 	ManagerCounter,
 	ManagerTime,
 	GCBlockNumberGauge,
+	SPHealthCheckerTime,
+	SPHealthCheckerFailureCounter,

 	// workflow metrics category
 	PerfApprovalTime,
@@ -247,6 +249,20 @@ var (
 		Name: "gc_block_number",
 		Help: "Track the next gc block number.",
 	}, []string{"gc_block_number"})
+	SPHealthCheckerTime = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "sp_health_checker_request_time",
+			Help: "Request duration in seconds.",
+		},
+		[]string{"sp_id"},
+	)
+	SPHealthCheckerFailureCounter = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "sp_health_checker_request_counter",
+			Help: "Request failure count.",
+		},
+		[]string{"sp_id"},
+	)
 )

 // workflow metrics items
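The two collectors added to metric_items.go give the health checker a per-SP latency histogram and a failure counter. The sketch below shows how such collectors are defined, registered, and updated with the standard Prometheus Go client; in the repository they are registered through the MetricsItems slice rather than MustRegister, and the sp_id label comes from sp.Id inside checkSPHealth, so the registration call and the spID value here are illustrative only.

// Sketch of defining and updating the two health-checker collectors with
// github.com/prometheus/client_golang. Names and values are placeholders.
package main

import (
	"strconv"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

var (
	spHealthCheckerTime = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name: "sp_health_checker_request_time",
			Help: "Request duration in seconds.",
		},
		[]string{"sp_id"},
	)
	spHealthCheckerFailureCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "sp_health_checker_request_counter",
			Help: "Request failure count.",
		},
		[]string{"sp_id"},
	)
)

func main() {
	prometheus.MustRegister(spHealthCheckerTime, spHealthCheckerFailureCounter)

	spID := uint32(7) // hypothetical SP id
	start := time.Now()
	ok := false // pretend the health probe failed

	// Record the attempt duration and, on failure, bump the failure counter,
	// both labelled by SP id so per-SP dashboards and alerts are possible.
	spHealthCheckerTime.WithLabelValues(strconv.Itoa(int(spID))).Observe(time.Since(start).Seconds())
	if !ok {
		spHealthCheckerFailureCounter.WithLabelValues(strconv.Itoa(int(spID))).Inc()
	}
}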
