Skip to content
This repository was archived by the owner on Oct 9, 2023. It is now read-only.

Commit 8b417f4

Browse files
authored
Add ray dashboard log link (#405)
* Add ray dashboard log link
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
* A few fixes
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
* lint
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
* lint
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
* Add EnableUsageStats start parameter
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
* Add more tests
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
* more tests
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
---------
Signed-off-by: Haytham Abuelfutuh <haytham@afutuh.com>
1 parent 2598c96 commit 8b417f4

File tree

7 files changed

+243
-50
lines changed

7 files changed

+243
-50
lines changed

go/tasks/config/config.go

+6-2
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,19 @@ var (
1212
rootSection = config.MustRegisterSection(configSectionKey, &Config{})
1313
)
1414

15-
// Top level plugins config.
15+
// Config is the top level plugins config.
1616
type Config struct {
1717
}
1818

19-
// Retrieves the current config value or default.
19+
// GetConfig retrieves the current config value or default.
2020
func GetConfig() *Config {
2121
return rootSection.GetConfig().(*Config)
2222
}
2323

2424
// MustRegisterSubSection registers the given config section under the
// top-level plugins section, panicking on registration failure (as the
// Must prefix implies).
func MustRegisterSubSection(key string, section config.Config) config.Section {
	return rootSection.MustRegisterSection(key, section)
}
27+
28+
func MustRegisterSubSectionWithUpdates(subSectionKey string, section config.Config, sectionUpdatedFn config.SectionUpdated) config.Section {
29+
return rootSection.MustRegisterSectionWithUpdates(subSectionKey, section, sectionUpdatedFn)
30+
}

go/tasks/pluginmachinery/core/phase.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ func phaseInfo(p Phase, v uint32, err *core.ExecutionError, info *TaskInfo, clea
184184
}
185185
}
186186

187-
// Return in the case the plugin is not ready to start
187+
// PhaseInfoNotReady represents the case the plugin is not ready to start
188188
func PhaseInfoNotReady(t time.Time, version uint32, reason string) PhaseInfo {
189189
pi := phaseInfo(PhaseNotReady, version, nil, &TaskInfo{OccurredAt: &t}, false)
190190
pi.reason = reason
@@ -198,7 +198,7 @@ func PhaseInfoWaitingForResources(t time.Time, version uint32, reason string) Ph
198198
return pi
199199
}
200200

201-
// Return in the case the plugin is not ready to start
201+
// PhaseInfoWaitingForResourcesInfo represents the case the plugin is waiting for resources to become available
202202
func PhaseInfoWaitingForResourcesInfo(t time.Time, version uint32, reason string, info *TaskInfo) PhaseInfo {
203203
pi := phaseInfo(PhaseWaitingForResources, version, nil, info, false)
204204
pi.reason = reason

go/tasks/plugins/k8s/ray/config.go

+50-4
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
package ray
22

33
import (
4+
"context"
5+
46
pluginsConfig "github.com/flyteorg/flyteplugins/go/tasks/config"
7+
"github.com/flyteorg/flyteplugins/go/tasks/logs"
58
pluginmachinery "github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/k8s"
9+
"github.com/flyteorg/flytestdlib/config"
610
)
711

812
//go:generate pflags Config --default-var=defaultConfig
@@ -14,10 +18,39 @@ var (
1418
ServiceType: "NodePort",
1519
IncludeDashboard: true,
1620
DashboardHost: "0.0.0.0",
17-
NodeIPAddress: "$MY_POD_IP",
21+
EnableUsageStats: false,
22+
Defaults: DefaultConfig{
23+
HeadNode: NodeConfig{
24+
StartParameters: map[string]string{
25+
// Disable usage reporting by default: https://docs.ray.io/en/latest/cluster/usage-stats.html
26+
DisableUsageStatsStartParameter: "true",
27+
},
28+
IPAddress: "$MY_POD_IP",
29+
},
30+
WorkerNode: NodeConfig{
31+
StartParameters: map[string]string{
32+
// Disable usage reporting by default: https://docs.ray.io/en/latest/cluster/usage-stats.html
33+
DisableUsageStatsStartParameter: "true",
34+
},
35+
IPAddress: "$MY_POD_IP",
36+
},
37+
},
1838
}
1939

20-
configSection = pluginsConfig.MustRegisterSubSection("ray", &defaultConfig)
40+
configSection = pluginsConfig.MustRegisterSubSectionWithUpdates("ray", &defaultConfig,
41+
func(ctx context.Context, newValue config.Config) {
42+
if newValue == nil {
43+
return
44+
}
45+
46+
if len(newValue.(*Config).Defaults.HeadNode.IPAddress) == 0 {
47+
newValue.(*Config).Defaults.HeadNode.IPAddress = newValue.(*Config).DeprecatedNodeIPAddress
48+
}
49+
50+
if len(newValue.(*Config).Defaults.WorkerNode.IPAddress) == 0 {
51+
newValue.(*Config).Defaults.WorkerNode.IPAddress = newValue.(*Config).DeprecatedNodeIPAddress
52+
}
53+
})
2154
)
2255

2356
// Config is config for 'ray' plugin
@@ -39,11 +72,24 @@ type Config struct {
3972
// or 0.0.0.0 (available from all interfaces). By default, this is localhost.
4073
DashboardHost string `json:"dashboardHost,omitempty"`
4174

42-
// NodeIPAddress the IP address of the head node. By default, this is pod ip address.
43-
NodeIPAddress string `json:"nodeIPAddress,omitempty"`
75+
// DeprecatedNodeIPAddress the IP address of the head node. By default, this is pod ip address.
76+
DeprecatedNodeIPAddress string `json:"nodeIPAddress,omitempty" pflag:"-,DEPRECATED. Please use DefaultConfig.[HeadNode|WorkerNode].IPAddress"`
4477

4578
// Remote Ray Cluster Config
4679
RemoteClusterConfig pluginmachinery.ClusterConfig `json:"remoteClusterConfig" pflag:"Configuration of remote K8s cluster for ray jobs"`
80+
Logs logs.LogConfig `json:"logs" pflag:"-,Log configuration for ray jobs"`
81+
Defaults DefaultConfig `json:"defaults" pflag:"-,Default configuration for ray jobs"`
82+
EnableUsageStats bool `json:"enableUsageStats" pflag:",Enable usage stats for ray jobs. These stats are submitted to usage-stats.ray.io per https://docs.ray.io/en/latest/cluster/usage-stats.html"`
83+
}
84+
85+
type DefaultConfig struct {
86+
HeadNode NodeConfig `json:"headNode,omitempty" pflag:"-,Default configuration for head node of ray jobs"`
87+
WorkerNode NodeConfig `json:"workerNode,omitempty" pflag:"-,Default configuration for worker node of ray jobs"`
88+
}
89+
90+
// NodeConfig captures the default settings for a single ray node type
// (head or worker).
type NodeConfig struct {
	// StartParameters are used as the node's ray start parameters when the
	// task does not supply its own.
	StartParameters map[string]string `json:"startParameters,omitempty" pflag:"-,Start parameters for the node"`
	// IPAddress is the node's IP address; it may reference an environment
	// variable such as $MY_POD_IP that is resolved at runtime.
	IPAddress string `json:"ipAddress,omitempty" pflag:"-,IP address of the node"`
}
4894

4995
func GetConfig() *Config {

go/tasks/plugins/k8s/ray/config_flags.go

+1-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/tasks/plugins/k8s/ray/config_flags_test.go

+14-14
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/tasks/plugins/k8s/ray/ray.go

+51-25
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ import (
55
"fmt"
66
"strconv"
77
"strings"
8-
"time"
98

10-
"github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/core"
9+
"github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery/tasklog"
10+
1111
"github.com/flyteorg/flyteidl/gen/pb-go/flyteidl/plugins"
1212
"github.com/flyteorg/flyteplugins/go/tasks/logs"
1313
"github.com/flyteorg/flyteplugins/go/tasks/pluginmachinery"
@@ -27,11 +27,12 @@ import (
2727
)
2828

2929
const (
30-
rayTaskType = "ray"
31-
KindRayJob = "RayJob"
32-
IncludeDashboard = "include-dashboard"
33-
NodeIPAddress = "node-ip-address"
34-
DashboardHost = "dashboard-host"
30+
rayTaskType = "ray"
31+
KindRayJob = "RayJob"
32+
IncludeDashboard = "include-dashboard"
33+
NodeIPAddress = "node-ip-address"
34+
DashboardHost = "dashboard-host"
35+
DisableUsageStatsStartParameter = "disable-usage-stats"
3536
)
3637

3738
type rayJobResourceHandler struct {
@@ -57,7 +58,6 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
5758
}
5859

5960
podSpec, objectMeta, primaryContainerName, err := flytek8s.ToK8sPodSpec(ctx, taskCtx)
60-
6161
if err != nil {
6262
return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to create pod spec: [%v]", err.Error())
6363
}
@@ -76,26 +76,36 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
7676
return nil, flyteerr.Errorf(flyteerr.BadTaskSpecification, "Unable to get primary container from the pod: [%v]", err.Error())
7777
}
7878

79+
cfg := GetConfig()
7980
headReplicas := int32(1)
8081
headNodeRayStartParams := make(map[string]string)
8182
if rayJob.RayCluster.HeadGroupSpec != nil && rayJob.RayCluster.HeadGroupSpec.RayStartParams != nil {
8283
headNodeRayStartParams = rayJob.RayCluster.HeadGroupSpec.RayStartParams
84+
} else if headNode := cfg.Defaults.HeadNode; len(headNode.StartParameters) > 0 {
85+
headNodeRayStartParams = headNode.StartParameters
8386
}
87+
8488
if _, exist := headNodeRayStartParams[IncludeDashboard]; !exist {
8589
headNodeRayStartParams[IncludeDashboard] = strconv.FormatBool(GetConfig().IncludeDashboard)
8690
}
91+
8792
if _, exist := headNodeRayStartParams[NodeIPAddress]; !exist {
88-
headNodeRayStartParams[NodeIPAddress] = GetConfig().NodeIPAddress
93+
headNodeRayStartParams[NodeIPAddress] = cfg.Defaults.HeadNode.IPAddress
8994
}
95+
9096
if _, exist := headNodeRayStartParams[DashboardHost]; !exist {
91-
headNodeRayStartParams[DashboardHost] = GetConfig().DashboardHost
97+
headNodeRayStartParams[DashboardHost] = cfg.DashboardHost
98+
}
99+
100+
if _, exists := headNodeRayStartParams[DisableUsageStatsStartParameter]; !exists && !cfg.EnableUsageStats {
101+
headNodeRayStartParams[DisableUsageStatsStartParameter] = "true"
92102
}
93103

94104
enableIngress := true
95105
rayClusterSpec := rayv1alpha1.RayClusterSpec{
96106
HeadGroupSpec: rayv1alpha1.HeadGroupSpec{
97107
Template: buildHeadPodTemplate(&container, podSpec, objectMeta, taskCtx),
98-
ServiceType: v1.ServiceType(GetConfig().ServiceType),
108+
ServiceType: v1.ServiceType(cfg.ServiceType),
99109
Replicas: &headReplicas,
100110
EnableIngress: &enableIngress,
101111
RayStartParams: headNodeRayStartParams,
@@ -111,16 +121,24 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
111121
if spec.MinReplicas != 0 {
112122
minReplicas = spec.MinReplicas
113123
}
124+
114125
if spec.MaxReplicas != 0 {
115126
maxReplicas = spec.MaxReplicas
116127
}
117128

118129
workerNodeRayStartParams := make(map[string]string)
119130
if spec.RayStartParams != nil {
120131
workerNodeRayStartParams = spec.RayStartParams
132+
} else if workerNode := cfg.Defaults.WorkerNode; len(workerNode.StartParameters) > 0 {
133+
workerNodeRayStartParams = workerNode.StartParameters
121134
}
135+
122136
if _, exist := workerNodeRayStartParams[NodeIPAddress]; !exist {
123-
workerNodeRayStartParams[NodeIPAddress] = GetConfig().NodeIPAddress
137+
workerNodeRayStartParams[NodeIPAddress] = cfg.Defaults.WorkerNode.IPAddress
138+
}
139+
140+
if _, exists := workerNodeRayStartParams[DisableUsageStatsStartParameter]; !exists && !cfg.EnableUsageStats {
141+
workerNodeRayStartParams[DisableUsageStatsStartParameter] = "true"
124142
}
125143

126144
workerNodeSpec := rayv1alpha1.WorkerGroupSpec{
@@ -145,8 +163,8 @@ func (rayJobResourceHandler) BuildResource(ctx context.Context, taskCtx pluginsC
145163
jobSpec := rayv1alpha1.RayJobSpec{
146164
RayClusterSpec: rayClusterSpec,
147165
Entrypoint: strings.Join(container.Args, " "),
148-
ShutdownAfterJobFinishes: GetConfig().ShutdownAfterJobFinishes,
149-
TTLSecondsAfterFinished: &GetConfig().TTLSecondsAfterFinished,
166+
ShutdownAfterJobFinishes: cfg.ShutdownAfterJobFinishes,
167+
TTLSecondsAfterFinished: &cfg.TTLSecondsAfterFinished,
150168
RuntimeEnv: rayJob.RuntimeEnv,
151169
}
152170

@@ -347,12 +365,10 @@ func (rayJobResourceHandler) BuildIdentityResource(ctx context.Context, taskCtx
347365
}, nil
348366
}
349367

350-
func getEventInfoForRayJob() (*pluginsCore.TaskInfo, error) {
351-
taskLogs := make([]*core.TaskLog, 0, 3)
352-
logPlugin, err := logs.InitializeLogPlugins(logs.GetLogConfig())
353-
368+
func getEventInfoForRayJob(logConfig logs.LogConfig, pluginContext k8s.PluginContext, rayJob *rayv1alpha1.RayJob) (*pluginsCore.TaskInfo, error) {
369+
logPlugin, err := logs.InitializeLogPlugins(&logConfig)
354370
if err != nil {
355-
return nil, err
371+
return nil, fmt.Errorf("failed to initialize log plugins. Error: %w", err)
356372
}
357373

358374
if logPlugin == nil {
@@ -362,22 +378,31 @@ func getEventInfoForRayJob() (*pluginsCore.TaskInfo, error) {
362378
// TODO: Retrieve the name of head pod from rayJob.status, and add it to task logs
363379
// RayJob CRD does not include the name of the worker or head pod for now
364380

365-
// TODO: Add ray Dashboard URI to task logs
381+
taskID := pluginContext.TaskExecutionMetadata().GetTaskExecutionID().GetID()
382+
logOutput, err := logPlugin.GetTaskLogs(tasklog.Input{
383+
Namespace: rayJob.Namespace,
384+
TaskExecutionIdentifier: &taskID,
385+
})
386+
387+
if err != nil {
388+
return nil, fmt.Errorf("failed to generate task logs. Error: %w", err)
389+
}
366390

367391
return &pluginsCore.TaskInfo{
368-
Logs: taskLogs,
392+
Logs: logOutput.TaskLogs,
369393
}, nil
370394
}
371395

372-
func (rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, resource client.Object) (pluginsCore.PhaseInfo, error) {
396+
func (plugin rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s.PluginContext, resource client.Object) (pluginsCore.PhaseInfo, error) {
373397
rayJob := resource.(*rayv1alpha1.RayJob)
374-
info, err := getEventInfoForRayJob()
398+
info, err := getEventInfoForRayJob(GetConfig().Logs, pluginContext, rayJob)
375399
if err != nil {
376400
return pluginsCore.PhaseInfoUndefined, err
377401
}
402+
378403
switch rayJob.Status.JobStatus {
379404
case rayv1alpha1.JobStatusPending:
380-
return pluginsCore.PhaseInfoNotReady(time.Now(), pluginsCore.DefaultPhaseVersion, "job is pending"), nil
405+
return pluginsCore.PhaseInfoInitializing(rayJob.Status.StartTime.Time, pluginsCore.DefaultPhaseVersion, "job is pending", info), nil
381406
case rayv1alpha1.JobStatusFailed:
382407
reason := fmt.Sprintf("Failed to create Ray job: %s", rayJob.Name)
383408
return pluginsCore.PhaseInfoFailure(flyteerr.TaskFailedWithError, reason, info), nil
@@ -386,7 +411,8 @@ func (rayJobResourceHandler) GetTaskPhase(ctx context.Context, pluginContext k8s
386411
case rayv1alpha1.JobStatusRunning:
387412
return pluginsCore.PhaseInfoRunning(pluginsCore.DefaultPhaseVersion, info), nil
388413
}
389-
return pluginsCore.PhaseInfoQueued(time.Now(), pluginsCore.DefaultPhaseVersion, "JobCreated"), nil
414+
415+
return pluginsCore.PhaseInfoQueued(rayJob.CreationTimestamp.Time, pluginsCore.DefaultPhaseVersion, "JobCreated"), nil
390416
}
391417

392418
func init() {

0 commit comments

Comments
 (0)