diff --git a/pkg/session/model/cmd/main.go b/pkg/session/model/cmd/main.go
new file mode 100644
index 00000000..af25c843
--- /dev/null
+++ b/pkg/session/model/cmd/main.go
@@ -0,0 +1,193 @@
+package main
+
+import (
+	"fmt"
+	"math/rand"
+	"os"
+	"strconv"
+	"time"
+
+	"github.com/dustin/go-humanize"
+	"github.com/filecoin-project/lassie/pkg/session"
+	"github.com/filecoin-project/lassie/pkg/session/model"
+	"github.com/multiformats/go-multicodec"
+)
+
+// Provider archetypes used to build the simulated population. Each one
+// describes per-protocol probabilities and timing distributions.
+var (
+	GRAPHSYNC_FAST_RELIABLE_LOTS_OF_POPULAR_DATA = model.Provider{
+		Name: "graphsync fast, semi-reliable, lots of popular data",
+		Probabilities: map[multicodec.Code]model.Probabilities{
+			multicodec.TransportGraphsyncFilecoinv1: {
+				Candidate:         model.Chance(0.5),
+				Success:           model.Chance(0.6),
+				ConnectTimeMs:     model.ProbDist{StdDev: 6, Mean: 10},
+				TimeToFirstByteMs: model.ProbDist{StdDev: 6, Mean: 10},
+				BandwidthBps:      model.ProbDist{StdDev: 1e6, Mean: 1e8}, // Mean of 100MB/s +/- 1MB/s
+				LatencyMs:         model.ProbDist{StdDev: 1, Mean: 20},
+				FastRetrieval:     model.Chance(0.9),
+				Verified:          model.Chance(0.9),
+			},
+		},
+	}
+
+	GRAPHSYNC_MEDIUM_RELIABLE_SOME_POPULAR_DATA = model.Provider{
+		Name: "graphsync medium, semi-reliable, some popular data",
+		Probabilities: map[multicodec.Code]model.Probabilities{
+			multicodec.TransportGraphsyncFilecoinv1: {
+				Candidate:         model.Chance(0.3),
+				Success:           model.Chance(0.5),
+				ConnectTimeMs:     model.ProbDist{StdDev: 6, Mean: 50},
+				TimeToFirstByteMs: model.ProbDist{StdDev: 10, Mean: 25},
+				BandwidthBps:      model.ProbDist{StdDev: 1e6, Mean: 1e7}, // Mean of 10MB/s +/- 1MB/s
+				LatencyMs:         model.ProbDist{StdDev: 10, Mean: 40},
+				FastRetrieval:     model.Chance(0.9),
+				Verified:          model.Chance(0.9),
+			},
+		},
+	}
+
+	GRAPHSYNC_MEDIUM_RELIABLE_MINIMAL_POPULAR_DATA = model.Provider{
+		Name: "graphsync medium, semi-reliable, minimal popular data",
+		Probabilities: map[multicodec.Code]model.Probabilities{
+			multicodec.TransportGraphsyncFilecoinv1: {
+				Candidate:         model.Chance(0.1),
+				Success:           model.Chance(0.5),
+				ConnectTimeMs:     model.ProbDist{StdDev: 6, Mean: 50},
+				TimeToFirstByteMs: model.ProbDist{StdDev: 10, Mean: 25},
+				BandwidthBps:      model.ProbDist{StdDev: 1e6, Mean: 1e7}, // Mean of 10MB/s +/- 1MB/s
+				LatencyMs:         model.ProbDist{StdDev: 10, Mean: 40},
+				FastRetrieval:     model.Chance(0.9),
+				Verified:          model.Chance(0.9),
+			},
+		},
+	}
+
+	GRAPHSYNC_MEDIUM_UNRELIABLE_SOME_POPULAR_DATA = model.Provider{
+		Name: "graphsync medium, unreliable, some popular data",
+		Probabilities: map[multicodec.Code]model.Probabilities{
+			multicodec.TransportGraphsyncFilecoinv1: {
+				Candidate:         model.Chance(0.3),
+				Success:           model.Chance(0.3),
+				ConnectTimeMs:     model.ProbDist{StdDev: 6, Mean: 50},
+				TimeToFirstByteMs: model.ProbDist{StdDev: 20, Mean: 50},
+				BandwidthBps:      model.ProbDist{StdDev: 1e5, Mean: 1e6}, // Mean of 1MB/s +/- 100KB/s
+				LatencyMs:         model.ProbDist{StdDev: 10, Mean: 40},
+				FastRetrieval:     model.Chance(0.5),
+				Verified:          model.Chance(0.5),
+			},
+		},
+	}
+
+	GRAPHSYNC_MEDIUM_VERY_UNRELIABLE_SOME_POPULAR_DATA = model.Provider{
+		Name: "graphsync medium, very unreliable, some popular data",
+		Probabilities: map[multicodec.Code]model.Probabilities{
+			multicodec.TransportGraphsyncFilecoinv1: {
+				Candidate:         model.Chance(0.3),
+				Success:           model.Chance(0.1),
+				ConnectTimeMs:     model.ProbDist{StdDev: 100, Mean: 200},
+				TimeToFirstByteMs: model.ProbDist{StdDev: 6, Mean: 100},
+				BandwidthBps:      model.ProbDist{StdDev: 1e5, Mean: 1e6}, // Mean of 1MB/s +/- 100KB/s
+				LatencyMs:         model.ProbDist{StdDev: 10, Mean: 100},
+				FastRetrieval:     model.Chance(0.2),
+				Verified:          model.Chance(0.2),
+			},
+		},
+	}
+
+	HTTP_FAST_SEMIRELIABLE_LOTS_OF_POPULAR_DATA = model.Provider{
+		Name: "http fast, semi-reliable, lots of popular data", // e-ipfs?
+		Probabilities: map[multicodec.Code]model.Probabilities{
+			multicodec.TransportIpfsGatewayHttp: {
+				Candidate:         model.Chance(0.5),
+				Success:           model.Chance(0.5),
+				ConnectTimeMs:     model.ProbDist{StdDev: 0, Mean: 0},
+				TimeToFirstByteMs: model.ProbDist{StdDev: 6, Mean: 10},
+				BandwidthBps:      model.ProbDist{StdDev: 1e6, Mean: 1e8}, // Mean of 100MB/s +/- 1MB/s
+				LatencyMs:         model.ProbDist{StdDev: 1, Mean: 20},
+			},
+		},
+	}
+
+	HTTP_MEDIUM_FLAKY_SOME_POPULAR_DATA = model.Provider{
+		Name: "http medium, semi-reliable, lots of popular data", // e-ipfs?
+		Probabilities: map[multicodec.Code]model.Probabilities{
+			multicodec.TransportIpfsGatewayHttp: {
+				Candidate:         model.Chance(0.7),
+				Success:           model.Chance(0.6),
+				ConnectTimeMs:     model.ProbDist{StdDev: 0, Mean: 0},
+				TimeToFirstByteMs: model.ProbDist{StdDev: 6, Mean: 10},
+				BandwidthBps:      model.ProbDist{StdDev: 1e6, Mean: 1e7}, // Mean of 10MB/s +/- 1MB/s
+				LatencyMs:         model.ProbDist{StdDev: 10, Mean: 40},
+			},
+		},
+	}
+)
+
+// main runs a simulated set of retrievals against a session and prints
+// summary statistics. An optional integer argument seeds the random number
+// generator so that a run can be reproduced.
+func main() {
+	seed := time.Now().UnixNano()
+	switch len(os.Args) {
+	case 1:
+	case 2:
+		// first arg is a seed if it's a number
+		if s, err := strconv.ParseInt(os.Args[1], 10, 64); err == nil {
+			seed = s
+		} else {
+			fmt.Println("Usage: go run main.go [seed]")
+			os.Exit(1)
+		}
+	default:
+		fmt.Println("Usage: go run main.go [seed]")
+		os.Exit(1)
+	}
+
+	simRand := rand.New(rand.NewSource(seed))
+
+	// TODO: generate static population up-front with fixed characteristics
+	pop := &model.Population{}
+	pop.Add(GRAPHSYNC_FAST_RELIABLE_LOTS_OF_POPULAR_DATA, 4)
+	pop.Add(GRAPHSYNC_MEDIUM_RELIABLE_SOME_POPULAR_DATA, 20)
+	pop.Add(GRAPHSYNC_MEDIUM_UNRELIABLE_SOME_POPULAR_DATA, 20)
+	pop.Add(GRAPHSYNC_MEDIUM_RELIABLE_MINIMAL_POPULAR_DATA, 50)
+	pop.Add(HTTP_FAST_SEMIRELIABLE_LOTS_OF_POPULAR_DATA, 1)
+
+	sim := model.Simulation{
+		Population:      pop,
+		Retrievals:      50000,
+		RetrievalSize:   model.ProbDist{StdDev: 2e7, Mean: 1e7}, // Mean of 10MB +/- 20MB; NOTE(review): Mean and StdDev may be swapped, confirm
+		HttpChance:      model.Chance(0.5),
+		GraphsyncChance: model.Chance(0.5),
+	}
+
+	ret := sim.Run(simRand)
+	cfg := session.DefaultConfig()
+	cfg.Random = simRand
+	ses := session.NewSession(cfg, true)
+	res := ret.RunWith(simRand, ses)
+
+	fmt.Println("---------------------------------------------------------------")
+	fmt.Println("Simulation of", len(ret), "retrievals, seed:", seed)
+	fmt.Println()
+	fmt.Printf("\t Size per retrieval: %s < %s < %s\n", humanize.IBytes(uint64(ret.MinSize())), humanize.IBytes(uint64(ret.AvgSize())), humanize.IBytes(uint64(ret.MaxSize())))
+	fmt.Printf("\tCandidate per retrieval: %s < %s < %s\n", humanize.Comma(int64(ret.MinCandidateCount())), humanize.Comma(int64(ret.AvgCandidateCount())), humanize.Comma(int64(ret.MaxCandidateCount())))
+	fmt.Println("---------------------------------------------------------------")
+	fmt.Printf("\t Runs: %d\n", res.Runs)
+	fmt.Printf("\t Successes: %d\n", res.Successes)
+	fmt.Printf("\t Retrieval failures: %d\n", res.RetrievalFailures)
+	fmt.Printf("\t Size: %s\n", humanize.IBytes(uint64(res.Size)))
+	fmt.Printf("\t Total time: %v\n", time.Duration(res.TotalTimeMs)*time.Millisecond)
+	fmt.Printf("\t Average TTFB: %s\n", time.Duration(res.AverageTimeToFirstByteMs)*time.Millisecond)
+	fmt.Printf("\t Average bandwidth: %s/s\n", humanize.IBytes(uint64(res.AverageBandwidth)))
+	// Total bandwidth is bytes over elapsed seconds; compute it in floating
+	// point and guard against a zero total time to avoid dividing by zero.
+	var totalBandwidth uint64
+	if res.TotalTimeMs > 0 {
+		totalBandwidth = uint64(float64(res.Size) / (float64(res.TotalTimeMs) / 1000))
+	}
+	fmt.Printf("\t Total bandwidth: %s/s\n", humanize.IBytes(totalBandwidth))
+	fmt.Println("---------------------------------------------------------------")
+}
diff --git a/pkg/session/model/pop.go b/pkg/session/model/pop.go
new file mode 100644
index 00000000..f2f3a6a4
--- /dev/null
+++ b/pkg/session/model/pop.go
@@ -0,0 +1,27 @@
+package model
+
+import "github.com/multiformats/go-multicodec"
+
+// Provider describes a class of storage provider: a human-readable name and
+// its probabilistic behaviour for each transport protocol it supports.
+type Provider struct {
+	Name          string
+	Probabilities map[multicodec.Code]Probabilities
+}
+
+// Population is the set of provider classes taking part in a simulation.
+type Population struct {
+	Providers []PC
+}
+
+// PC pairs a Provider with the number of instances of it in a Population.
+type PC struct {
+	Provider Provider
+	Count    int
+}
+
+// Add registers count instances of provider with the population.
+func (p *Population) Add(provider Provider, count int) {
+	// append handles a nil slice, so no explicit initialisation is needed
+	p.Providers = append(p.Providers, PC{Provider: provider, Count: count})
+}
diff --git a/pkg/session/model/prob.go b/pkg/session/model/prob.go
new file mode 100644
index 00000000..34067958
--- /dev/null
+++ b/pkg/session/model/prob.go
@@ -0,0 +1,50 @@
+package model
+
+import "math/rand"
+
+// Probabilities defines the probabilistic behaviour of a provider for a
+// particular protocol
+type Probabilities struct {
+	// Probability of being a candidate for any given retrieval [0,1]
+	Candidate Chance
+	// Probability of a successful retrieval [0,1]
+	Success Chance
+	// Distribution for connect time in milliseconds
+	ConnectTimeMs ProbDist
+	// Distribution for time to first byte in milliseconds
+	TimeToFirstByteMs ProbDist
+	// Distribution in bandwidth in bytes per second, this has to account for
+	// block fetching speed on the remote, not just the pipe
+	BandwidthBps ProbDist
+	// Distribution for latency in milliseconds, this will be multiplied to
+	// simulate connection initialisation round-trips
+	LatencyMs ProbDist
+	// Probability of having FastRetrieval for a graphsync retrieval [0,1]
+	FastRetrieval Chance
+	// Probability of having Verified for a graphsync retrieval [0,1]
+	Verified Chance
+}
+
+// ProbDist describes a normal distribution by its standard deviation and mean.
+type ProbDist struct {
+	StdDev float64
+	Mean   float64
+}
+
+// Sample draws a value from the normal distribution using rand. The result is
+// unbounded, so callers that need a non-negative quantity must clamp it.
+func (pd ProbDist) Sample(rand *rand.Rand) float64 {
+	return rand.NormFloat64()*pd.StdDev + pd.Mean
+}
+
+// Chance is the probability of a Roll() being true, the higher the value in the
+// range [0,1] the more likely it is to be true.
+type Chance float64
+
+// Roll returns true with probability c, using rand as the source of randomness.
+func (c Chance) Roll(rand *rand.Rand) bool {
+	return rand.Float64() < float64(c)
+}
+
+// FIFTY_FIFTY is an even chance.
+const FIFTY_FIFTY = Chance(0.5)
diff --git a/pkg/session/model/sim.go b/pkg/session/model/sim.go
new file mode 100644
index 00000000..8be52fdd
--- /dev/null
+++ b/pkg/session/model/sim.go
@@ -0,0 +1,429 @@
+package model
+
+import (
+	"fmt"
+	"math"
+	"math/rand"
+	"time"
+
+	"github.com/filecoin-project/lassie/pkg/session"
+	"github.com/filecoin-project/lassie/pkg/types"
+	"github.com/ipni/go-libipni/metadata"
+	peer "github.com/libp2p/go-libp2p/core/peer"
+	"github.com/multiformats/go-multicodec"
+)
+
+// tickIncrement is the amount of simulated time added per simulation step.
+const tickIncrement = 5 * time.Millisecond
+
+// Simulation describes a batch of simulated retrievals over a Population.
+type Simulation struct {
+	Population      *Population
+	Retrievals      int
+	RetrievalSize   ProbDist
+	HttpChance      Chance
+	GraphsyncChance Chance
+}
+
+// Run generates sim.Retrievals retrievals, drawing all randomness from rand.
+func (sim Simulation) Run(rand *rand.Rand) RetrievalSet {
+	retrievalSet := make(RetrievalSet, sim.Retrievals)
+	for i := 0; i < sim.Retrievals; i++ {
+		retrievalSet[i] = sim.Retrieval(rand)
+	}
+	return retrievalSet
+}
+
+// Retrieval generates a single retrieval: the protocols in play, at least one
+// candidate drawn from the population, and a size of at least 100KiB.
+//
+// NOTE(review): both loops below retry until they succeed, so this does not
+// return if HttpChance and GraphsyncChance are both zero, or if no provider in
+// the population can be a candidate for the selected protocol(s).
+func (sim Simulation) Retrieval(rand *rand.Rand) Retrieval {
+	var http, gs bool
+	for !http && !gs {
+		http = sim.HttpChance.Roll(rand)
+		gs = sim.GraphsyncChance.Roll(rand)
+	}
+	var candidates []Candidate
+	for len(candidates) == 0 {
+		for _, pc := range sim.Population.Providers {
+			for i := pc.Count; http && i > 0; i-- {
+				if pc.Provider.Probabilities[multicodec.TransportIpfsGatewayHttp].Candidate.Roll(rand) {
+					id := peer.ID(fmt.Sprintf("cand%d", len(candidates)))
+					candidates = append(candidates, Candidate{id, pc.Provider, multicodec.TransportIpfsGatewayHttp})
+				}
+			}
+			for i := pc.Count; gs && i > 0; i-- {
+				if pc.Provider.Probabilities[multicodec.TransportGraphsyncFilecoinv1].Candidate.Roll(rand) {
+					id := peer.ID(fmt.Sprintf("cand%d", len(candidates)))
+					candidates = append(candidates, Candidate{id, pc.Provider, multicodec.TransportGraphsyncFilecoinv1})
+				}
+			}
+		}
+	}
+	size := int64(sim.RetrievalSize.Sample(rand))
+	// minimum block size of 100KiB
+	if size < 102400 {
+		size = 102400
+	}
+	return Retrieval{
+		Candidates: candidates,
+		Size:       size,
+	}
+}
+
+// Retrieval is a single simulated retrieval: its candidates and its size in bytes.
+type Retrieval struct {
+	Candidates []Candidate
+	Size       int64
+}
+
+// Candidate is a provider that may serve a retrieval over a particular protocol.
+type Candidate struct {
+	ID       peer.ID
+	Provider Provider
+	Protocol multicodec.Code
+}
+
+// candidateRun holds the pre-computed outcome of attempting a retrieval from a
+// single candidate. A ConnectTimeMs of -1 marks a candidate that fails to connect.
+type candidateRun struct {
+	Candidate         Candidate
+	ConnectTimeMs     int64
+	TimeToFirstByteMs int64
+	DurationMs        int64
+	Connected         bool
+	Success           bool
+	Metadata          metadata.Protocol
+}
+
+// protocolRun tracks the state of the retrieval attempts for one protocol.
+type protocolRun struct {
+	Size           int64
+	CandidateRuns  []*candidateRun
+	WaitingIndexes []int
+	RunningIndex   int
+	RunningFb      bool
+	RunningFromMs  int64
+	FailureCount   int
+}
+
+// Done reports whether every candidate for this protocol has failed.
+func (pr protocolRun) Done() bool {
+	return pr.FailureCount >= len(pr.CandidateRuns)
+}
+
+// atLeast returns v, or floor if v is smaller than floor. ProbDist samples are
+// unbounded, so this keeps sampled times and rates physically meaningful.
+func atLeast(v, floor float64) float64 {
+	if v < floor {
+		return floor
+	}
+	return v
+}
+
+// Tick advances this protocol's run to the simulated time tick. It is an
+// artificial retriever simulation that is quantised to `tick`, so it's not
+// quite accurate but close enough to simulate a real retrieval.
+//
+// It returns whether the running candidate completed successfully, that
+// candidate's time to first byte in milliseconds, and a time in milliseconds:
+// the completion time on success, otherwise RunningFromMs plus tick.
+func (pr *protocolRun) Tick(ses *session.Session, tick time.Duration) (bool, int64, int64) {
+	// check for candidates needing to connect in this window
+	for i, cr := range pr.CandidateRuns {
+		if !cr.Connected && cr.ConnectTimeMs >= 0 && cr.ConnectTimeMs <= tick.Milliseconds() {
+			cr.Connected = true
+			pr.WaitingIndexes = append(pr.WaitingIndexes, i)
+			ses.RecordConnectTime(cr.Candidate.ID, time.Millisecond*time.Duration(cr.ConnectTimeMs))
+		}
+	}
+	// see if the current running candidate is done
+	if pr.RunningIndex != -1 {
+		cr := pr.CandidateRuns[pr.RunningIndex]
+		if !pr.RunningFb && pr.RunningFromMs+cr.TimeToFirstByteMs <= tick.Milliseconds() {
+			pr.RunningFb = true
+			ses.RecordFirstByteTime(cr.Candidate.ID, time.Millisecond*time.Duration(cr.TimeToFirstByteMs))
+		}
+		if pr.RunningFromMs+cr.DurationMs <= tick.Milliseconds() {
+			pr.RunningIndex = -1
+			pr.RunningFb = false
+			if cr.Success {
+				// guard against a zero duration so the bandwidth stays finite
+				durationMs := cr.DurationMs
+				if durationMs < 1 {
+					durationMs = 1
+				}
+				bandwidth := float64(pr.Size) / (float64(durationMs) / 1000)
+				ses.RecordSuccess(cr.Candidate.ID, uint64(bandwidth))
+				return true, cr.TimeToFirstByteMs, pr.RunningFromMs + cr.DurationMs
+			}
+			pr.FailureCount++
+			ses.RecordFailure(types.RetrievalID{}, cr.Candidate.ID) // ignore err, not relevant
+		}
+	}
+	// nobody running, pick next and start a retrieval
+	if pr.RunningIndex == -1 && len(pr.WaitingIndexes) > 0 {
+		var next int
+		if len(pr.WaitingIndexes) > 1 {
+			peers := make([]peer.ID, len(pr.WaitingIndexes))
+			mda := make([]metadata.Protocol, len(pr.WaitingIndexes))
+			for i, idx := range pr.WaitingIndexes {
+				peers[i] = pr.CandidateRuns[idx].Candidate.ID
+				mda[i] = pr.CandidateRuns[idx].Metadata
+			}
+			next = ses.ChooseNextProvider(peers, mda)
+		}
+		pr.RunningIndex = pr.WaitingIndexes[next]
+		pr.RunningFromMs = tick.Milliseconds()
+		pr.WaitingIndexes = append(pr.WaitingIndexes[:next], pr.WaitingIndexes[next+1:]...)
+	}
+	return false, 0, pr.RunningFromMs + tick.Milliseconds()
+}
+
+// initialWait is the simulated time of the first tick of a retrieval.
+var initialWait = (2 * time.Millisecond)
+
+// RunWith models a retrieval across multiple candidates for both protocols.
+// It sets up the initial state and then ticks the simulation until it's done.
+// Running the simulation in ticks makes it simpler but quantises the run,
+// losing resolution that a real execution would have.
+func (rs Retrieval) RunWith(rand *rand.Rand, ses *session.Session) RetrievalResult {
+	// setup separate runs for GS & HTTP
+	runs := map[multicodec.Code]*protocolRun{
+		multicodec.TransportGraphsyncFilecoinv1: {
+			Size:          rs.Size,
+			CandidateRuns: make([]*candidateRun, 0),
+			RunningIndex:  -1,
+		},
+		multicodec.TransportIpfsGatewayHttp: {
+			Size:          rs.Size,
+			CandidateRuns: make([]*candidateRun, 0),
+			RunningIndex:  -1,
+		},
+	}
+	gsRun := runs[multicodec.TransportGraphsyncFilecoinv1]
+	httpRun := runs[multicodec.TransportIpfsGatewayHttp]
+
+	for _, c := range rs.Candidates {
+		cr := &candidateRun{Candidate: c}
+		runs[c.Protocol].CandidateRuns = append(runs[c.Protocol].CandidateRuns, cr)
+		probs := c.Provider.Probabilities[c.Protocol]
+		switch c.Protocol {
+		case multicodec.TransportGraphsyncFilecoinv1:
+			cr.Metadata = &metadata.GraphsyncFilecoinV1{
+				VerifiedDeal:  probs.Verified.Roll(rand),
+				FastRetrieval: probs.FastRetrieval.Roll(rand),
+			}
+		case multicodec.TransportIpfsGatewayHttp:
+			cr.Metadata = &metadata.IpfsGatewayHttp{}
+		}
+		cr.Success = probs.Success.Roll(rand)
+		if !cr.Success && FIFTY_FIFTY.Roll(rand) {
+			// If a failure, is it fail on connect or retrieve?
+			// Should this be a probability on the provider since it impacts connect time metric recording?
+			cr.ConnectTimeMs = -1
+			runs[c.Protocol].FailureCount++
+			ses.RecordFailure(types.RetrievalID{}, c.ID) // ignore err, not relevant
+		} else { // actually manages to connect
+			// samples are clamped so that a negative draw cannot be mistaken for
+			// the -1 "failed to connect" marker or produce a negative duration
+			cr.ConnectTimeMs = int64(atLeast(probs.ConnectTimeMs.Sample(rand), 0))
+			// figure out a reasonable(ish) bandwidth and latency, and therefore the
+			// real bandwidth of a potential successful transfer
+			bandwidth := atLeast(probs.BandwidthBps.Sample(rand), 1)
+			// double the sampled latency to simulate a transfer init
+			latency := 2 * atLeast(probs.LatencyMs.Sample(rand), 0)
+			cr.DurationMs = int64(latency + ((float64(rs.Size) / bandwidth) * 1000))
+			if !cr.Success {
+				// failure at some random point in the transfer, so not full duration
+				cr.DurationMs = int64(rand.Float64() * float64(cr.DurationMs))
+			} else {
+				cr.TimeToFirstByteMs = int64(latency + atLeast(probs.TimeToFirstByteMs.Sample(rand), 0))
+			}
+		}
+	}
+
+	tick := initialWait
+	for {
+		var gsSuccess, httpSuccess bool
+		var gsTtfbMs, httpTtfbMs, gsDurationMs, httpDurationMs int64
+		if !gsRun.Done() {
+			gsSuccess, gsTtfbMs, gsDurationMs = gsRun.Tick(ses, tick)
+		}
+		if !httpRun.Done() {
+			httpSuccess, httpTtfbMs, httpDurationMs = httpRun.Tick(ses, tick)
+		}
+		if gsSuccess || httpSuccess {
+			if tick == initialWait {
+				panic("unexpected success during initialWait")
+			}
+			durationMs := gsDurationMs
+			ttfbMs := gsTtfbMs
+			if httpSuccess && (!gsSuccess || httpDurationMs < durationMs) {
+				durationMs = httpDurationMs
+			}
+			// ttfb could be from the opposite protocol to the successful one
+			if httpSuccess && (!gsSuccess || httpTtfbMs < ttfbMs) {
+				ttfbMs = httpTtfbMs
+			}
+			return RetrievalResult{
+				RunTimeMs:         durationMs,
+				TimeToFirstByteMs: ttfbMs,
+				Success:           true,
+				Failures:          gsRun.FailureCount + httpRun.FailureCount,
+				BandwidthBps:      int64(float64(rs.Size) / (float64(durationMs) / 1000)),
+			}
+		}
+		if gsDurationMs == 0 && httpDurationMs == 0 || tick > 30*time.Second {
+			// NOTE(review): unreachable as written - this branch returns once tick
+			// exceeds 30s, so tick can never be above 2m here.
+			if tick > 2*time.Minute {
+				fmt.Print("tick > 2m, bailing out, got: ")
+				for _, cr := range gsRun.CandidateRuns {
+					fmt.Print(cr.ConnectTimeMs, "ms+", time.Duration(cr.DurationMs)*time.Millisecond, " ")
+				}
+				fmt.Println()
+			}
+			return RetrievalResult{
+				RunTimeMs:    tick.Milliseconds(),
+				Success:      false,
+				Failures:     gsRun.FailureCount + httpRun.FailureCount,
+				BandwidthBps: 0,
+			}
+		}
+		tick += tickIncrement
+	}
+}
+
+// RetrievalSet is a collection of generated retrievals.
+type RetrievalSet []Retrieval
+
+// AvgSize returns the mean retrieval size, or 0 for an empty set.
+func (rs RetrievalSet) AvgSize() int64 {
+	if len(rs) == 0 {
+		return 0
+	}
+	var total int64
+	for _, r := range rs {
+		total += r.Size
+	}
+	return total / int64(len(rs))
+}
+
+// MaxSize returns the largest retrieval size, or 0 for an empty set.
+func (rs RetrievalSet) MaxSize() int64 {
+	var largest int64
+	for _, r := range rs {
+		if r.Size > largest {
+			largest = r.Size
+		}
+	}
+	return largest
+}
+
+// MinSize returns the smallest retrieval size, or 0 for an empty set.
+func (rs RetrievalSet) MinSize() int64 {
+	if len(rs) == 0 {
+		return 0
+	}
+	var smallest int64 = math.MaxInt64
+	for _, r := range rs {
+		if r.Size < smallest {
+			smallest = r.Size
+		}
+	}
+	return smallest
+}
+
+// AvgCandidateCount returns the mean number of candidates, or 0 for an empty set.
+func (rs RetrievalSet) AvgCandidateCount() int {
+	if len(rs) == 0 {
+		return 0
+	}
+	total := 0
+	for _, r := range rs {
+		total += len(r.Candidates)
+	}
+	return total / len(rs)
+}
+
+// MaxCandidateCount returns the largest number of candidates, or 0 for an empty set.
+func (rs RetrievalSet) MaxCandidateCount() int {
+	largest := 0
+	for _, r := range rs {
+		if len(r.Candidates) > largest {
+			largest = len(r.Candidates)
+		}
+	}
+	return largest
+}
+
+// MinCandidateCount returns the smallest number of candidates, or 0 for an empty set.
+func (rs RetrievalSet) MinCandidateCount() int {
+	if len(rs) == 0 {
+		return 0
+	}
+	smallest := math.MaxInt
+	for _, r := range rs {
+		if len(r.Candidates) < smallest {
+			smallest = len(r.Candidates)
+		}
+	}
+	return smallest
+}
+
+// RunWith iterates through the retrievals and executes them, collecting and
+// returning summary data. Averages are left at zero when no retrieval succeeds.
+func (rs RetrievalSet) RunWith(rand *rand.Rand, ses *session.Session) Result {
+	results := make([]RetrievalResult, len(rs))
+	var successes, retrievalFailures int
+	var size, totalTimeMs, totalBandwidthBps, totalTtfbMs int64
+
+	for i, r := range rs {
+		results[i] = r.RunWith(rand, ses)
+		size += r.Size
+		retrievalFailures += results[i].Failures
+		totalTimeMs += results[i].RunTimeMs
+		if results[i].Success {
+			successes++
+			totalBandwidthBps += results[i].BandwidthBps
+			totalTtfbMs += results[i].TimeToFirstByteMs
+		}
+	}
+	res := Result{
+		Runs:              len(rs),
+		Successes:         successes,
+		RetrievalFailures: retrievalFailures,
+		Size:              size,
+		TotalTimeMs:       totalTimeMs,
+	}
+	// averages are only defined when at least one retrieval succeeded
+	if successes > 0 {
+		res.AverageTimeToFirstByteMs = totalTtfbMs / int64(successes)
+		res.AverageBandwidth = totalBandwidthBps / int64(successes)
+	}
+	return res
+}
+
+// RetrievalResult is the outcome of running a single Retrieval.
+type RetrievalResult struct {
+	RunTimeMs         int64
+	TimeToFirstByteMs int64
+	Success           bool
+	Failures          int
+	BandwidthBps      int64
+}
+
+// Result summarises the outcomes of running a RetrievalSet.
+type Result struct {
+	Runs                     int
+	Successes                int
+	RetrievalFailures        int // total individual failures, failures across all retrievals
+	Size                     int64
+	TotalTimeMs              int64
+	AverageTimeToFirstByteMs int64
+	AverageBandwidth         int64
+}
diff --git a/pkg/session/state.go b/pkg/session/state.go
index 0b78ebac..5ba5e7fd 100644
--- a/pkg/session/state.go
+++ b/pkg/session/state.go
@@ -216,12 +216,13 @@ func (spt *SessionState) RecordFailure(retrievalId types.RetrievalID, storagePro
 	spt.lk.Lock()
 	defer spt.lk.Unlock()
 
+	spt.recordSuccessMetric(storageProviderId, 0)
+
 	// remove from this retrieval to free up the SP to be tried again for a future retrieval
 	if err := spt.removeFromRetrieval(retrievalId, storageProviderId); err != nil {
 		return err
 	}
 
-	spt.recordSuccessMetric(storageProviderId, 0)
 	return nil
 }
 
@@ -329,6 +330,14 @@ func (spt *SessionState) ChooseNextProvider(peers []peer.ID, mda []metadata.Prot
 	// choose a random peer, weighted by score
 	r := tot * spt.config.roll()
 
+	/*
+		sb := strings.Builder{}
+		for _, pi := range ind {
+			sb.WriteString(fmt.Sprintf("%s(#%d): %f, ", string(peers[pi]), pi, scores[pi]))
+		}
+		sb.WriteString(fmt.Sprintf("with roll of %f", r))
+		fmt.Println(sb.String())
+	*/
 	for _, pi := range ind {
 		s := scores[pi]
 		if r <= s {
@@ -339,7 +348,7 @@ func (spt *SessionState) ChooseNextProvider(peers []peer.ID, mda []metadata.Prot
 	sb := strings.Builder{}
 	sb.WriteString("internal error - failed to choose a provider from: ")
 	for _, pi := range ind {
-		sb.WriteString(fmt.Sprintf("%s: %f, ", peers[pi], scores[pi]))
+		sb.WriteString(fmt.Sprintf("%s: %f, ", peers[pi].String(), scores[pi]))
 	}
 	sb.WriteString(fmt.Sprintf("with roll of %f", r))
 	panic(sb.String())