Skip to content

Commit

Permalink
reflow: include create time in alloc metadata; store inspect output in runs database
Browse files Browse the repository at this point in the history

Summary: This is useful debugging information to have.

Reviewers: pgopal

Reviewed By: pgopal

Differential Revision: https://phabricator.grailbio.com/D8899

fbshipit-source-id: 7a47e23
  • Loading branch information
mariusae authored and grailbot committed Dec 22, 2017
1 parent c889b20 commit 039228d
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 3 deletions.
16 changes: 15 additions & 1 deletion local/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ type alloc struct {
mu sync.Mutex
id string
p *Pool
created time.Time
expires time.Time
lastKeepalive time.Time
freed bool
Expand Down Expand Up @@ -450,7 +451,19 @@ func (p *Pool) newAlloc(id string) *alloc {
// Note that we refresh the keepalive time on exec restore. This is
// probably a useful safeguard, but could be annoying when keepalive
// intervals are large.
return &alloc{Executor: e, id: id, p: p, expires: time.Now().Add(keepaliveInterval), remoteStream: remoteStream}
//
// TODO(marius): persist alloc states across restarts. This doesn't
// matter too much at present, as ec2 nodes are terminated when
// the reflowlet terminates, but it should be done for potential future
// implementations.
return &alloc{
Executor: e,
id: id,
p: p,
created: time.Now(),
expires: time.Now().Add(keepaliveInterval),
remoteStream: remoteStream,
}
}

// configure stores the given metadata in the alloc's directory.
Expand Down Expand Up @@ -542,6 +555,7 @@ func (a *alloc) Inspect(ctx context.Context) (pool.AllocInspect, error) {
ID: a.id,
Resources: a.meta.Want,
Meta: a.meta,
Created: a.created,
Expires: a.expires,
LastKeepalive: a.lastKeepalive,
}
Expand Down
1 change: 1 addition & 0 deletions pool/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ type AllocInspect struct {
ID string
Resources reflow.Resources
Meta AllocMeta
Created time.Time
LastKeepalive time.Time
Expires time.Time
}
Expand Down
6 changes: 6 additions & 0 deletions runner/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,9 @@ type StaticCluster struct {
// Allocate requests an alloc by delegating to pool.Allocate, handing
// it the StaticCluster itself together with the caller's resource
// bounds and labels, and returns that call's results unchanged.
func (s *StaticCluster) Allocate(ctx context.Context, min, max reflow.Resources, labels pool.Labels) (pool.Alloc, error) {
	alloc, err := pool.Allocate(ctx, s, min, max, labels)
	return alloc, err
}

// TracingCluster is a cluster that traces the actions of an underlying
// cluster manager.
//
// It embeds Cluster, so the embedded value's methods are promoted onto
// TracingCluster as-is. NOTE(review): no tracing-specific methods are
// visible next to this declaration — confirm where the tracing
// behavior is implemented.
type TracingCluster struct {
// Cluster is the underlying cluster being wrapped.
Cluster
}
23 changes: 21 additions & 2 deletions runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,10 @@ type State struct {
Args []string
// Phase holds the current phase of the run.
Phase Phase
// AllocID is the full URI for the run's alloc; it is empty if the
// run has no alloc.
AllocID string
// AllocInspect is the alloc's inspect output.
AllocInspect pool.AllocInspect
// Value contains the result of the evaluation,
// rendered as a string.
// TODO(marius): serialize the value into JSON.
Expand All @@ -123,18 +125,26 @@ type State struct {
LastTry time.Time
// Created is the time of the run's creation.
Created time.Time
// Completion is the time of the run's completion.
Completion time.Time

// TotalResources stores the total amount of resources used
// by this run. Note that the resources are in resource-minutes.
TotalResources reflow.Resources
}

// Reset returns the state to its initial condition so that the run
// reinitializes the next time it is executed. Only the fields
// assigned below are cleared; everything else, such as the run's
// name and other metadata, is left untouched.
func (s *State) Reset() {
	// Zero value used to clear the recorded timestamps.
	var unset time.Time

	// Return to the Init phase and drop the alloc binding.
	s.Phase = Init
	s.AllocID, s.AllocInspect = "", pool.AllocInspect{}

	// Discard the outcome and retry bookkeeping of earlier attempts.
	s.Result, s.Err = "", nil
	s.NumTries = 0

	// Clear all recorded timestamps.
	s.LastTry, s.Created, s.Completion = unset, unset, unset
}

// String returns a string representation of the state.
Expand Down Expand Up @@ -219,6 +229,13 @@ func (r *Runner) Do(ctx context.Context) bool {
break
}
r.AllocID = r.Alloc.ID()
var err error
r.AllocInspect, err = r.Alloc.Inspect(ctx)
if err != nil {
r.Err = errors.Recover(err)
r.Phase = Done
break
}
r.Phase = Eval
case Eval:
r.LastTry = time.Now()
Expand All @@ -237,6 +254,7 @@ func (r *Runner) Do(ctx context.Context) bool {
r.Result, err = r.Eval(ctx)
if err == nil {
r.Phase = Done
r.Completion = time.Now()
break
}
r.Err = errors.Recover(err)
Expand All @@ -247,6 +265,7 @@ func (r *Runner) Do(ctx context.Context) bool {
r.Phase = Retry
} else {
r.Log.Debugf("marking run done after nonrecoverable error %v", r.Err)
r.Completion = time.Now()
r.Phase = Done
}
case Retry:
Expand All @@ -258,6 +277,7 @@ func (r *Runner) Do(ctx context.Context) bool {
r.NumTries++
if r.NumTries > maxTries {
r.Err = errors.Recover(errors.E(errors.TooManyTries, r.Err))
r.Completion = time.Now()
r.Phase = Done
break
}
Expand All @@ -268,7 +288,6 @@ func (r *Runner) Do(ctx context.Context) bool {
time.Sleep(w)
r.Phase = Init
r.Err = nil
case Done:
}
return r.Phase != Done
}
Expand Down
3 changes: 3 additions & 0 deletions tool/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ func (c *Cmd) printRunInfo(ctx context.Context, w io.Writer, name runner.Name) {
if state.AllocID != "" {
fmt.Fprintf(w, "\talloc:\t%s\n", state.AllocID)
}
if !state.AllocInspect.Resources.IsZeroAll() {
fmt.Fprintf(w, "\tresources:\t%s\n", state.AllocInspect.Resources)
}
if state.Err != nil {
fmt.Fprintf(w, "\terror:\t%s\n", state.Err)
}
Expand Down

0 comments on commit 039228d

Please sign in to comment.