Skip to content

Commit

Permalink
Speculative query execution (apache#1178)
Browse files Browse the repository at this point in the history
* Rework metrics locking

* Metrics are now split into:
     hostMetrics - for a list of metrics
     queryMetrics - for a map and a locker
* Added functions to perform locked metrics updates/reads
* Locking is private for the metrics only, so should have no
performance effects.

Signed-off-by: Alex Lourie <alex@instaclustr.com>

* Introduce Speculative Policy

* Define the speculative policy
* Add NonSpeculative policy
* Add SimpleSpeculative policy

Signed-off-by: Alex Lourie <alex@instaclustr.com>

* Add IsIdempotent to ExecutableQuery interface

Signed-off-by: Alex Lourie <alex@instaclustr.com>

* Implement speculative execution

* Refactor executeQuery to execute main code in a separate goroutine
* Handle speculative/non-speculative cases separately
* Add TestSpeculativeExecution test

Signed-off-by: Alex Lourie <alex@instaclustr.com>

* Review comments

* Make one code path for all executions
* Simplify the results handling
* Update the tests

Signed-off-by: Alex Lourie <alex@instaclustr.com>

* More review comments

* Metric lock improvements
* Style cleanups

Signed-off-by: Alex Lourie <alex@instaclustr.com>

* Fix Latency calc lock

Signed-off-by: Alex Lourie <alex@instaclustr.com>

* Fix session.go for new metrics

Signed-off-by: Alex Lourie <alex@instaclustr.com>
  • Loading branch information
alourie authored and Zariel committed Oct 9, 2018
1 parent 39a77fa commit aa46e85
Show file tree
Hide file tree
Showing 7 changed files with 374 additions and 100 deletions.
109 changes: 102 additions & 7 deletions conn_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ import (
"fmt"
"io"
"io/ioutil"
"math/rand"
"net"
"os"
"strings"
"sync"
"sync/atomic"
Expand Down Expand Up @@ -44,8 +46,8 @@ func TestApprove(t *testing.T) {

func TestJoinHostPort(t *testing.T) {
tests := map[string]string{
"127.0.0.1:0": JoinHostPort("127.0.0.1", 0),
"127.0.0.1:1": JoinHostPort("127.0.0.1:1", 9142),
"127.0.0.1:0": JoinHostPort("127.0.0.1", 0),
"127.0.0.1:1": JoinHostPort("127.0.0.1:1", 9142),
"[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:0": JoinHostPort("2001:0db8:85a3:0000:0000:8a2e:0370:7334", 0),
"[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:1": JoinHostPort("[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:1", 9142),
}
Expand Down Expand Up @@ -316,7 +318,7 @@ func TestCancel(t *testing.T) {
}

type testQueryObserver struct {
metrics map[string]*queryMetrics
metrics map[string]*hostMetrics
verbose bool
}

Expand All @@ -329,7 +331,7 @@ func (o *testQueryObserver) ObserveQuery(ctx context.Context, q ObservedQuery) {
}
}

func (o *testQueryObserver) GetMetrics(host *HostInfo) *queryMetrics {
func (o *testQueryObserver) GetMetrics(host *HostInfo) *hostMetrics {
return o.metrics[host.ConnectAddress().String()]
}

Expand Down Expand Up @@ -377,6 +379,12 @@ func TestQueryRetry(t *testing.T) {
}

func TestQueryMultinodeWithMetrics(t *testing.T) {
log := &testLogger{}
Logger = log
defer func() {
Logger = &defaultLogger{}
os.Stdout.WriteString(log.String())
}()

// Build a 3 node cluster to test host metric mapping
var nodes []*TestServer
Expand All @@ -401,18 +409,19 @@ func TestQueryMultinodeWithMetrics(t *testing.T) {

// 1 retry per host
rt := &SimpleRetryPolicy{NumRetries: 3}
observer := &testQueryObserver{metrics: make(map[string]*queryMetrics), verbose: false}
observer := &testQueryObserver{metrics: make(map[string]*hostMetrics), verbose: false}
qry := db.Query("kill").RetryPolicy(rt).Observer(observer)
if err := qry.Exec(); err == nil {
t.Fatalf("expected error")
}

for i, ip := range addresses {
host := &HostInfo{connectAddress: net.ParseIP(ip)}
queryMetric := qry.getHostMetrics(host)
observedMetrics := observer.GetMetrics(host)

requests := int(atomic.LoadInt64(&nodes[i].nKillReq))
hostAttempts := qry.metrics[ip].Attempts
hostAttempts := queryMetric.Attempts
if requests != hostAttempts {
t.Fatalf("expected requests %v to match query attempts %v", requests, hostAttempts)
}
Expand All @@ -421,7 +430,7 @@ func TestQueryMultinodeWithMetrics(t *testing.T) {
t.Fatalf("expected observed attempts %v to match query attempts %v on host %v", observedMetrics.Attempts, hostAttempts, ip)
}

hostLatency := qry.metrics[ip].TotalLatency
hostLatency := queryMetric.TotalLatency
observedLatency := observedMetrics.TotalLatency
if hostLatency != observedLatency {
t.Fatalf("expected observed latency %v to match query latency %v on host %v", observedLatency, hostLatency, ip)
Expand All @@ -435,6 +444,79 @@ func TestQueryMultinodeWithMetrics(t *testing.T) {

}

type testRetryPolicy struct {
NumRetries int
}

func (t *testRetryPolicy) Attempt(qry RetryableQuery) bool {
return qry.Attempts() <= t.NumRetries
}
func (t *testRetryPolicy) GetRetryType(err error) RetryType {
return Retry
}

func TestSpeculativeExecution(t *testing.T) {
log := &testLogger{}
Logger = log
defer func() {
Logger = &defaultLogger{}
os.Stdout.WriteString(log.String())
}()

// Build a 3 node cluster
var nodes []*TestServer
var addresses = []string{
"127.0.0.1",
"127.0.0.2",
"127.0.0.3",
}
// Can do with 1 context for all servers
ctx := context.Background()
for _, ip := range addresses {
srv := NewTestServerWithAddress(ip+":0", t, defaultProto, ctx)
defer srv.Stop()
nodes = append(nodes, srv)
}

db, err := newTestSession(defaultProto, nodes[0].Address, nodes[1].Address, nodes[2].Address)
if err != nil {
t.Fatalf("NewCluster: %v", err)
}
defer db.Close()

// Create a test retry policy, 6 retries will cover 2 executions
rt := &testRetryPolicy{NumRetries: 8}
// test Speculative policy with 1 additional execution
sp := &SimpleSpeculativeExecution{NumAttempts: 1, TimeoutDelay: 200 * time.Millisecond}

// Build the query
qry := db.Query("speculative").RetryPolicy(rt).SetSpeculativeExecutionPolicy(sp).Idempotent(true)

// Execute the query and close, check that it doesn't error out
if err := qry.Exec(); err != nil {
t.Errorf("The query failed with '%v'!\n", err)
}
requests1 := atomic.LoadInt64(&nodes[0].nKillReq)
requests2 := atomic.LoadInt64(&nodes[1].nKillReq)
requests3 := atomic.LoadInt64(&nodes[2].nKillReq)

// Spec Attempts == 1, so expecting to see only 1 regular + 1 speculative = 2 nodes attempted
if requests1 != 0 && requests2 != 0 && requests3 != 0 {
t.Error("error: all 3 nodes were attempted, should have been only 2")
}

// Only the 4th request will generate results, so
if requests1 != 4 && requests2 != 4 && requests3 != 4 {
t.Error("error: none of 3 nodes was attempted 4 times!")
}

// "speculative" query will succeed on one arbitrary node after 4 attempts, so
// expecting to see 4 (on successful node) + not more than 2 (as cancelled on another node) == 6
if requests1+requests2+requests3 > 6 {
t.Errorf("error: expected to see 6 attempts, got %v\n", requests1+requests2+requests3)
}
}

func TestStreams_Protocol1(t *testing.T) {
srv := NewTestServer(t, protoVersion1, context.Background())
defer srv.Stop()
Expand Down Expand Up @@ -1107,6 +1189,19 @@ func (srv *TestServer) process(f *framer) {
}
}()
return
case "speculative":
atomic.AddInt64(&srv.nKillReq, 1)
if atomic.LoadInt64(&srv.nKillReq) > 3 {
f.writeHeader(0, opResult, head.stream)
f.writeInt(resultKindVoid)
f.writeString("speculative query success on the node " + srv.Address)
} else {
f.writeHeader(0, opError, head.stream)
f.writeInt(0x1001)
f.writeString("speculative error")
rand.Seed(time.Now().UnixNano())
<-time.After(time.Millisecond * 120)
}
default:
f.writeHeader(0, opResult, head.stream)
f.writeInt(resultKindVoid)
Expand Down
3 changes: 1 addition & 2 deletions control.go
Original file line number Diff line number Diff line change
Expand Up @@ -453,8 +453,7 @@ func (c *controlConn) query(statement string, values ...interface{}) (iter *Iter
Logger.Printf("control: error executing %q: %v\n", statement, iter.err)
}

metric := q.getHostMetrics(c.getConn().host)
metric.Attempts++
q.AddAttempts(1, c.getConn().host)
if iter.err == nil || !c.retry.Attempt(q) {
break
}
Expand Down
25 changes: 25 additions & 0 deletions policies.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
package gocql

import (
"context"
"errors"
"fmt"
"math"
"math/rand"
Expand Down Expand Up @@ -130,6 +132,7 @@ type RetryableQuery interface {
Attempts() int
SetConsistency(c Consistency)
GetConsistency() Consistency
GetContext() context.Context
}

type RetryType uint16
Expand All @@ -141,6 +144,10 @@ const (
Rethrow RetryType = 0x03 // raise error and stop retrying
)

// ErrUnknownRetryType is returned if the retry policy returns a retry type
// unknown to the query executor.
var ErrUnknownRetryType = errors.New("unknown retry type returned by retry policy")

// RetryPolicy interface is used by gocql to determine if a query can be attempted
// again after a retryable error has been received. The interface allows gocql
// users to implement their own logic to determine if a query can be attempted
Expand Down Expand Up @@ -852,3 +859,21 @@ func (e *ExponentialReconnectionPolicy) GetInterval(currentRetry int) time.Durat
func (e *ExponentialReconnectionPolicy) GetMaxRetries() int {
return e.MaxRetries
}

type SpeculativeExecutionPolicy interface {
Attempts() int
Delay() time.Duration
}

type NonSpeculativeExecution struct{}

func (sp NonSpeculativeExecution) Attempts() int { return 0 } // No additional attempts
func (sp NonSpeculativeExecution) Delay() time.Duration { return 1 } // The delay. Must be positive to be used in a ticker.

type SimpleSpeculativeExecution struct {
NumAttempts int
TimeoutDelay time.Duration
}

func (sp *SimpleSpeculativeExecution) Attempts() int { return sp.NumAttempts }
func (sp *SimpleSpeculativeExecution) Delay() time.Duration { return sp.TimeoutDelay }
8 changes: 4 additions & 4 deletions policies_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -263,9 +263,9 @@ func TestSimpleRetryPolicy(t *testing.T) {
{5, false},
}

q.metrics = make(map[string]*queryMetrics)
q.metrics = &queryMetrics{m: make(map[string]*hostMetrics)}
for _, c := range cases {
q.metrics["127.0.0.1"] = &queryMetrics{Attempts: c.attempts}
q.metrics.m["127.0.0.1"] = &hostMetrics{Attempts: c.attempts}
if c.allow && !rt.Attempt(q) {
t.Fatalf("should allow retry after %d attempts", c.attempts)
}
Expand Down Expand Up @@ -348,9 +348,9 @@ func TestDowngradingConsistencyRetryPolicy(t *testing.T) {
{16, false, reu1, Retry},
}

q.metrics = make(map[string]*queryMetrics)
q.metrics = &queryMetrics{m: make(map[string]*hostMetrics)}
for _, c := range cases {
q.metrics["127.0.0.1"] = &queryMetrics{Attempts: c.attempts}
q.metrics.m["127.0.0.1"] = &hostMetrics{Attempts: c.attempts}
if c.retryType != rt.GetRetryType(c.err) {
t.Fatalf("retry type should be %v", c.retryType)
}
Expand Down
Loading

0 comments on commit aa46e85

Please sign in to comment.