Skip to content

Commit

Permalink
[Dashboard] CPU/GPU usage details in actor pane (#11269)
Browse files Browse the repository at this point in the history
  • Loading branch information
mfitton authored Oct 14, 2020
1 parent 933cf66 commit cd9dcfc
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 109 deletions.
14 changes: 10 additions & 4 deletions dashboard/client/src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@ export type RayConfigResponse = {

export const getRayConfig = () => get<RayConfigResponse>("/api/ray_config", {});

export type Worker = {
type ProcessStats = {
pid: number;
workerId: string;
createTime: number;
memoryInfo: {
rss: number;
vms: number;
Expand All @@ -52,6 +50,7 @@ export type Worker = {
data: number;
dirty: Number;
};
createTime: number;
cmdline: string[];
cpuTimes: {
user: number;
Expand All @@ -61,12 +60,17 @@ export type Worker = {
iowait: number;
};
cpuPercent: number;
}

export type Worker = {
pid: number;
workerId: string;
logCount: number;
errorCount: number;
language: string;
jobId: string;
coreWorkerStats: CoreWorkerStats[];
};
} & ProcessStats;

export type CoreWorkerStats = {
ipAddress: string;
Expand Down Expand Up @@ -220,12 +224,14 @@ export type FullActorInfo = {
| ActorState.DependenciesUnready
| ActorState.PendingCreation;
taskQueueLength?: number;
gpus: GPUStats[]; // Contains info about any GPUs the actor is using
timestamp: number;
usedObjectStoreMemory?: number;
usedResources: { [key: string]: ResourceAllocations };
currentTaskDesc?: string;
numPendingTasks?: number;
webuiDisplay?: Record<string, string>;
processStats?: ProcessStats;
};

export type ActorTaskInfo = {
Expand Down
82 changes: 1 addition & 81 deletions dashboard/client/src/pages/dashboard/logical-view/Actor.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,8 @@ import {
launchKillActor,
launchProfiling,
} from "../../../api";
import { sum } from "../../../common/util";
import ActorDetailsPane from "./ActorDetailsPane";

const memoryDebuggingDocLink =
"https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory";

const useActorStyles = makeStyles((theme: Theme) =>
createStyles({
root: {
Expand Down Expand Up @@ -103,80 +99,6 @@ const Actor: React.FC<ActorProps> = ({ actor }) => {
}
};

const information = isFullActorInfo(actor)
? [
{
label: "Resources",
value:
actor.usedResources &&
Object.entries(actor.usedResources).length > 0 &&
Object.entries(actor.usedResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(
([key, value]) =>
`${sum(
value.resourceSlots.map((slot) => slot.allocation),
)} ${key}`,
)
.join(", "),
},
{
label: "Number of pending tasks",
value: actor.taskQueueLength?.toLocaleString() ?? "0",
tooltip:
"The number of tasks that are currently pending to execute on this actor. If this number " +
"remains consistently high, it may indicate that this actor is a bottleneck in your application.",
},
{
label: "Number of executed tasks",
value: actor.numExecutedTasks?.toLocaleString() ?? "0",
tooltip:
"The number of tasks this actor has executed throughout its lifetimes.",
},
{
label: "Number of ObjectRefs in scope",
value: actor.numObjectRefsInScope?.toLocaleString() ?? "0",
tooltip:
"The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
"This does not imply that the objects are in active use or colocated on the node with the actor " +
`currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
"for more information.",
},
{
label: "Number of local objects",
value: actor.numLocalObjects?.toLocaleString() ?? "0",
tooltip:
"The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
`debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`,
},
{
label: "Object store memory used (MiB)",
value: actor.usedObjectStoreMemory?.toLocaleString() ?? "0",
tooltip:
"The total amount of memory that this actor is occupying in the Ray object store. " +
"If this number is increasing without bounds, you might have a memory leak. See " +
`the docs at: ${memoryDebuggingDocLink} for more information.`,
},
]
: [
{
label: "Actor ID",
value: actor.actorId,
tooltip: "",
},
{
label: "Required resources",
value:
actor.requiredResources &&
Object.entries(actor.requiredResources).length > 0 &&
Object.entries(actor.requiredResources)
.sort((a, b) => a[0].localeCompare(b[0]))
.map(([key, value]) => `${value.toLocaleString()} ${key}`)
.join(", "),
tooltip: "",
},
];

// Construct the custom message from the actor.
let actorCustomDisplay: JSX.Element[] = [];
if (isFullActorInfo(actor) && actor.webuiDisplay) {
Expand Down Expand Up @@ -274,9 +196,7 @@ const Actor: React.FC<ActorProps> = ({ actor }) => {
)}
</Typography>
<ActorDetailsPane
actorDetails={information}
actorClass={actor.actorClass}
actorState={actor.state}
actor={actor}
/>
{isFullActorInfo(actor) && (
<React.Fragment>
Expand Down
146 changes: 131 additions & 15 deletions dashboard/client/src/pages/dashboard/logical-view/ActorDetailsPane.tsx
Original file line number Diff line number Diff line change
@@ -1,17 +1,98 @@
import { Divider, Grid, makeStyles, Theme } from "@material-ui/core";
import { Divider, Grid, makeStyles, Theme, Typography } from "@material-ui/core";
import React from "react";
import { ActorState } from "../../../api";
import { ActorInfo, isFullActorInfo } from "../../../api";
import { sum } from "../../../common/util";
import LabeledDatum from "../../../common/LabeledDatum";
import ActorStateRepr from "./ActorStateRepr";
import UsageBar from '../../../common/UsageBar';

// URL of the Ray memory-debugging documentation. It is interpolated into the
// tooltip text of the memory-related rows built by labeledActorData below.
const memoryDebuggingDocLink =
"https://docs.ray.io/en/latest/memory-management.html#debugging-using-ray-memory";

/**
 * One labelled row of the actor details grid.
 *
 * ActorDetailsPane only renders a row when `value` is truthy and has a
 * non-zero `length`, so a falsy `value` means "omit this row".
 */
type ActorDatum = {
// Caption displayed for the row.
label: string;
// Content of the row; the builders in this file produce a string or a falsy
// placeholder. NOTE(review): typed `any` because the pane reads `value.length`.
value: any;
// Optional explanatory text for the row; rows may pass "" for none.
tooltip?: string;
}

/**
 * Builds the labelled rows shown in the lower grid of the actor details pane.
 *
 * For an actor that passes `isFullActorInfo` the rows describe its allocated
 * resources, its task counters, its ObjectRef / local-object counts and its
 * object store usage. For any other actor only the actor ID and its required
 * resources are listed.
 *
 * The `value` of a resources row is falsy (instead of a string) when the actor
 * has no resource entries.
 */
const labeledActorData = (actor: ActorInfo) => {
  // Guard clause: actors without full info only expose an ID and their
  // resource requirements.
  if (!isFullActorInfo(actor)) {
    const required = actor.requiredResources;
    return [
      {
        label: "Actor ID",
        value: actor.actorId,
        tooltip: "",
      },
      {
        label: "Required resources",
        value:
          required &&
          Object.entries(required).length > 0 &&
          Object.entries(required)
            // Alphabetical by resource name so the order is stable.
            .sort(([nameA], [nameB]) => nameA.localeCompare(nameB))
            .map(([name, amount]) => `${amount.toLocaleString()} ${name}`)
            .join(", "),
        tooltip: "",
      },
    ];
  }

  const used = actor.usedResources;
  const pendingTasks = actor.taskQueueLength?.toLocaleString() ?? "0";
  const executedTasks = actor.numExecutedTasks?.toLocaleString() ?? "0";
  const refsInScope = actor.numObjectRefsInScope?.toLocaleString() ?? "0";
  const localObjects = actor.numLocalObjects?.toLocaleString() ?? "0";
  const objectStoreMemory = actor.usedObjectStoreMemory?.toLocaleString() ?? "0";

  return [
    {
      label: "Resources",
      value:
        used &&
        Object.entries(used).length > 0 &&
        Object.entries(used)
          // Alphabetical by resource name so the order is stable.
          .sort(([nameA], [nameB]) => nameA.localeCompare(nameB))
          .map(([name, allocations]) => {
            // Total allocation of this resource across all of its slots.
            const total = sum(
              allocations.resourceSlots.map((slot) => slot.allocation),
            );
            return `${total} ${name}`;
          })
          .join(", "),
    },
    {
      label: "Number of pending tasks",
      value: pendingTasks,
      tooltip:
        "The number of tasks that are currently pending to execute on this actor. If this number " +
        "remains consistently high, it may indicate that this actor is a bottleneck in your application.",
    },
    {
      label: "Number of executed tasks",
      value: executedTasks,
      tooltip:
        "The number of tasks this actor has executed throughout its lifetimes.",
    },
    {
      label: "Number of ObjectRefs in scope",
      value: refsInScope,
      tooltip:
        "The number of ObjectRefs that this actor is keeping in scope via its internal state. " +
        "This does not imply that the objects are in active use or colocated on the node with the actor " +
        `currently. This can be useful for debugging memory leaks. See the docs at ${memoryDebuggingDocLink} ` +
        "for more information.",
    },
    {
      label: "Number of local objects",
      value: localObjects,
      tooltip:
        "The number of small objects that this actor has stored in its local in-process memory store. This can be useful for " +
        `debugging memory leaks. See the docs at ${memoryDebuggingDocLink} for more information`,
    },
    {
      label: "Object store memory used (MiB)",
      value: objectStoreMemory,
      tooltip:
        "The total amount of memory that this actor is occupying in the Ray object store. " +
        "If this number is increasing without bounds, you might have a memory leak. See " +
        `the docs at: ${memoryDebuggingDocLink} for more information.`,
    },
  ];
};


type ActorDetailsPaneProps = {
actorClass: string;
actorState: ActorState;
actorDetails: {
label: string;
value: any;
tooltip?: string;
}[];
actor: ActorInfo;
};

const useStyles = makeStyles((theme: Theme) => ({
Expand All @@ -31,20 +112,55 @@ const useStyles = makeStyles((theme: Theme) => ({
}));

const ActorDetailsPane: React.FC<ActorDetailsPaneProps> = ({
actorDetails,
actorClass,
actorState,
actor
}) => {
const classes = useStyles();
const actorData: ActorDatum[] = labeledActorData(actor);
return (
<React.Fragment>
<div className={classes.actorTitleWrapper}>
<div>{actorClass}</div>
<ActorStateRepr state={actorState} />
<div>{actor.actorClass}</div>
<ActorStateRepr state={actor.state} />
</div>
{isFullActorInfo(actor) &&
<Grid container className={classes.detailsPane}>
<Grid container item xs={6}>
<Grid item xs={4}>
<Typography>CPU Usage</Typography>
</Grid>
<Grid item xs={4}>
<UsageBar
percent={actor.processStats?.cpuPercent ?? 0}
text={`${actor.processStats?.cpuPercent ?? 0}%`}
/>
</Grid>
<Grid item xs={4} />
</Grid>
{ actor.gpus.length > 0 &&
<Grid container item xs={6}>
<Grid item xs={12}>
<Typography>GPU Usage</Typography>
</Grid>
{actor.gpus.map(gpu => (
<React.Fragment key={gpu.uuid}>
<Grid item xs={4}>
{`[${gpu.name}]`}
</Grid>
<Grid item xs={4}>
<UsageBar
percent={gpu.utilizationGpu * 100}
text={`${gpu.utilizationGpu * 100}%`}
/>
</Grid>
<Grid item xs={4} />
</React.Fragment>
))}
</Grid>
}
</Grid>}
<Divider className={classes.divider} />
<Grid container className={classes.detailsPane}>
{actorDetails.map(
{actorData.map(
({ label, value, tooltip }) =>
value &&
value.length > 0 && (
Expand Down
40 changes: 31 additions & 9 deletions dashboard/datacenter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import ray.new_dashboard.consts as dashboard_consts
import ray.new_dashboard.memory_utils as memory_utils
from collections import defaultdict
from ray.new_dashboard.actor_utils import actor_classname_from_task_spec
from ray.new_dashboard.utils import Dict, Signal

Expand Down Expand Up @@ -61,27 +62,48 @@ async def purge():
@classmethod
async def get_node_actors(cls, node_id):
node_stats = DataSource.node_stats.get(node_id, {})
worker_id_to_info = {}
node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
worker_id_to_raylet_info = {}
pid_to_worker_id = {}

for worker_stats in node_stats.get("workersStats", []):
worker_id_to_info[worker_stats["workerId"]] = worker_stats
worker_id_to_raylet_info[worker_stats["workerId"]] = worker_stats
pid_to_worker_id[worker_stats["pid"]] = worker_stats["workerId"]
# Index the per-process stats by worker id, keeping only processes whose pid
# was recorded in pid_to_worker_id from node_stats["workersStats"].
worker_id_to_process_info = {}

# node_physical_stats falls back to {} when nothing is stored for this node,
# so default to an empty list instead of iterating over None (TypeError).
for process_stats in node_physical_stats.get("workers", []):
    if process_stats["pid"] in pid_to_worker_id:
        worker_id = pid_to_worker_id[process_stats["pid"]]
        worker_id_to_process_info[worker_id] = process_stats

# For each worker id, collect every GPU entry that lists one of the worker's
# pids among its processes. Same empty-list default as above.
worker_id_to_gpu_stats = defaultdict(list)
for gpu_stats in node_physical_stats.get("gpus", []):
    for process in gpu_stats.get("processes", []):
        if process["pid"] in pid_to_worker_id:
            worker_id = pid_to_worker_id[process["pid"]]
            worker_id_to_gpu_stats[worker_id].append(gpu_stats)

node_actors = {}
for actor_id, actor_table_data in DataSource.actors.items():
if actor_table_data["address"]["workerId"] in worker_id_to_info:
worker_stats = worker_id_to_info[actor_table_data["address"][
"workerId"]]

actor_constructor = worker_stats.get("coreWorkerStats", {})\
.get("actorTitle", "Unknown actor constructor")
worker_id = actor_table_data["address"]["workerId"]
if worker_id in worker_id_to_raylet_info:
worker_raylet_stats = worker_id_to_raylet_info[worker_id]
core_worker = worker_raylet_stats.get("coreWorkerStats", {})
actor_constructor = core_worker.get(
"actorTitle", "Unknown actor constructor")

actor_table_data["actorConstructor"] = actor_constructor

actor_class = actor_classname_from_task_spec(
actor_table_data.get("taskSpec", {}))

actor_table_data["actorClass"] = actor_class
actor_table_data.update(worker_stats["coreWorkerStats"])
actor_table_data.update(core_worker)
node_actors[actor_id] = actor_table_data
actor_table_data["gpus"] = worker_id_to_gpu_stats.get(
worker_id, [])
actor_table_data["processStats"] = worker_id_to_process_info.get(
worker_id, {})
return node_actors

@classmethod
Expand Down
Loading

0 comments on commit cd9dcfc

Please sign in to comment.