Skip to content

Nodes: Some Bugfixes and new attributes #318

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions pyslurm/core/node.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,12 @@ cdef class Nodes(MultiClusterMap):
Attributes:
free_memory (int):
Amount of free memory in this node collection. (in Mebibytes)
Note that this is the actual free memory as reported by the `free`
command.
real_memory (int):
Amount of real memory in this node collection. (in Mebibytes)
idle_memory (int):
Amount of idle memory in this node collection. (in Mebibytes)
allocated_memory (int):
Amount of allocated memory in this node collection. (in Mebibytes)
total_cpus (int):
Expand Down Expand Up @@ -100,7 +104,7 @@ cdef class Node:

Other Parameters:
configured_gres (dict):
Configured GRES for the node
Configured GRES for the node
address (str):
Address of the node
hostname (str):
Expand Down Expand Up @@ -160,6 +164,10 @@ cdef class Node:
Real Memory in Mebibytes configured for this node.
free_memory (int):
Free Memory in Mebibytes on the node.
Note that this is the actual free memory as reported by the `free`
command.
idle_memory (int):
Idle Memory in Mebibytes on the node.
memory_reserved_for_system (int):
Memory in Mebibytes reserved for the System not usable by Jobs.
temporary_disk (int):
Expand Down Expand Up @@ -194,6 +202,8 @@ cdef class Node:
Time this node was last busy, as unix timestamp.
reason_time (int):
Time the reason was set for the node, as unix timestamp.
allocated_tres (dict):
Currently allocated Trackable Resources
allocated_cpus (int):
Number of allocated CPUs on the node.
idle_cpus (int):
Expand Down Expand Up @@ -235,4 +245,4 @@ cdef class Node:

@staticmethod
cdef Node from_ptr(node_info_t *in_ptr)

83 changes: 56 additions & 27 deletions pyslurm/core/node.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,14 @@ from pyslurm import xcollections
from pyslurm.utils.helpers import (
uid_to_name,
gid_to_name,
humanize,
humanize,
_getgrall_to_dict,
_getpwall_to_dict,
cpubind_to_num,
instance_to_dict,
nodelist_from_range_str,
nodelist_to_range_str,
gres_from_tres_dict,
)


Expand All @@ -65,7 +66,7 @@ cdef class Nodes(MultiClusterMap):
"""Load all nodes in the system.

Args:
preload_passwd_info (bool):
preload_passwd_info (bool):
Decides whether to query passwd and groups information from
the system.
Could potentially speed up access to attributes of the Node
Expand All @@ -83,7 +84,7 @@ cdef class Nodes(MultiClusterMap):
dict passwd = {}
dict groups = {}
Nodes nodes = Nodes()
int flags = slurm.SHOW_ALL
int flags = slurm.SHOW_ALL | slurm.SHOW_DETAIL
Node node

verify_rpc(slurm_load_node(0, &nodes.info, flags))
Expand All @@ -107,14 +108,20 @@ cdef class Nodes(MultiClusterMap):
# is raised by replacing it with a zeroed-out node_info_t.
nodes.info.node_array[cnt] = nodes.tmp_info

name = node.name
if not name:
# Could be possible if there are nodes configured in
# slurm.conf that cannot be reached anymore.
continue

if preload_passwd_info:
node.passwd = passwd
node.groups = groups

cluster = node.cluster
if cluster not in nodes.data:
nodes.data[cluster] = {}
nodes.data[cluster][node.name] = node
nodes.data[cluster][name] = node

# We have extracted all pointers
nodes.info.record_count = 0
Expand Down Expand Up @@ -162,7 +169,7 @@ cdef class Nodes(MultiClusterMap):
n._alloc_umsg()
cstr.fmalloc(&n.umsg.node_names, node_str)
verify_rpc(slurm_update_node(n.umsg))

@property
def free_memory(self):
return xcollections.sum_property(self, Node.free_memory)
Expand All @@ -171,6 +178,10 @@ cdef class Nodes(MultiClusterMap):
def real_memory(self):
return xcollections.sum_property(self, Node.real_memory)

@property
def idle_memory(self):
    """int: Amount of idle memory in this node collection, in Mebibytes.

    Aggregates `Node.idle_memory` over the collection through
    `xcollections.sum_property`.
    """
    idle_total = xcollections.sum_property(self, Node.idle_memory)
    return idle_total

@property
def allocated_memory(self):
return xcollections.sum_property(self, Node.allocated_memory)
Expand All @@ -186,7 +197,7 @@ cdef class Nodes(MultiClusterMap):
@property
def allocated_cpus(self):
return xcollections.sum_property(self, Node.allocated_cpus)

@property
def effective_cpus(self):
return xcollections.sum_property(self, Node.effective_cpus)
Expand Down Expand Up @@ -237,7 +248,7 @@ cdef class Node:
xfree(self.info)

def __dealloc__(self):
self._dealloc_impl()
self._dealloc_impl()

def __setattr__(self, name, val):
# When a user wants to set attributes on a Node instance that was
Expand All @@ -264,7 +275,7 @@ cdef class Node:
cdef _swap_data(Node dst, Node src):
cdef node_info_t *tmp = NULL
if dst.info and src.info:
tmp = dst.info
tmp = dst.info
dst.info = src.info
src.info = tmp

Expand Down Expand Up @@ -319,7 +330,7 @@ cdef class Node:
Implements the slurm_create_node RPC.

Args:
state (str, optional):
state (str, optional):
An optional state the created Node should have. Allowed values
are `future` and `cloud`. `future` is the default.

Expand Down Expand Up @@ -421,7 +432,7 @@ cdef class Node:

@configured_gres.setter
def configured_gres(self, val):
cstr.fmalloc2(&self.info.gres, &self.umsg.gres,
cstr.fmalloc2(&self.info.gres, &self.umsg.gres,
cstr.from_gres_dict(val))

@property
Expand Down Expand Up @@ -451,7 +462,7 @@ cdef class Node:
@extra.setter
def extra(self, val):
cstr.fmalloc2(&self.info.extra, &self.umsg.extra, val)

@property
def reason(self):
return cstr.to_unicode(self.info.reason)
Expand Down Expand Up @@ -486,7 +497,7 @@ cdef class Node:

@property
def allocated_gres(self):
return cstr.to_gres_dict(self.info.gres_used)
return gres_from_tres_dict(self.allocated_tres)

@property
def mcs_label(self):
Expand All @@ -511,6 +522,11 @@ cdef class Node:
def free_memory(self):
return u64_parse(self.info.free_mem)

@property
def idle_memory(self):
    """int: Idle memory on the node, in Mebibytes.

    Computed as `real_memory` minus `allocated_memory`. When
    `real_memory` is unset or zero, 0 is returned instead.
    """
    configured = self.real_memory
    if not configured:
        # Nothing configured (or value unavailable): report no idle memory.
        return 0

    return configured - self.allocated_memory

@property
def memory_reserved_for_system(self):
return u64_parse(self.info.mem_spec_limit)
Expand Down Expand Up @@ -596,17 +612,17 @@ cdef class Node:
# """dict: TRES that are configured on the node."""
# return cstr.to_dict(self.info.tres_fmt_str)

# @property
# def tres_alloc(self):
# cdef char *alloc_tres = NULL
# if self.info.select_nodeinfo:
# slurm_get_select_nodeinfo(
# self.info.select_nodeinfo,
# slurm.SELECT_NODEDATA_TRES_ALLOC_FMT_STR,
# slurm.NODE_STATE_ALLOCATED,
# &alloc_tres
# )
# return cstr.to_gres_dict(alloc_tres)
@property
def allocated_tres(self):
    """dict: Currently allocated Trackable Resources on the node.

    The value is requested from the node's `select_nodeinfo` data via
    `slurm_get_select_nodeinfo` and parsed with `cstr.to_dict`. If the
    node carries no `select_nodeinfo`, the string stays NULL and the
    result is whatever `cstr.to_dict` yields for a NULL input.
    """
    cdef char *alloc_tres = NULL
    if self.info.select_nodeinfo:
        slurm_get_select_nodeinfo(
            self.info.select_nodeinfo,
            slurm.SELECT_NODEDATA_TRES_ALLOC_FMT_STR,
            slurm.NODE_STATE_ALLOCATED,
            &alloc_tres
        )

    try:
        # to_dict converts the C string into Python objects, so the buffer
        # is no longer needed once it returns.
        return cstr.to_dict(alloc_tres)
    finally:
        # The string handed back by slurm_get_select_nodeinfo is owned by
        # the caller; release it here (same pattern as in `state`) so that
        # repeated property access does not leak memory.
        xfree(alloc_tres)

@property
def allocated_cpus(self):
Expand Down Expand Up @@ -671,10 +687,22 @@ cdef class Node:
"temperature": u32_parse(self.info.ext_sensors.temperature)
}

@property
def _node_state(self):
    """Return the node state value used for state string conversion.

    Starts from `self.info.node_state`. If the node has idle CPUs but
    their count differs from `effective_cpus` (i.e. it is neither idle
    nor fully allocated), only the bits selected by
    `slurm.NODE_STATE_FLAGS` are kept and `slurm.NODE_STATE_MIXED` is
    OR-ed in.
    """
    cpus_idle = self.idle_cpus
    raw_state = self.info.node_state

    if not cpus_idle or cpus_idle == self.effective_cpus:
        # Either no idle CPUs at all, or every effective CPU is idle:
        # the reported state is already accurate.
        return raw_state

    # Partially used node: report it as MIXED.
    return (raw_state & slurm.NODE_STATE_FLAGS) | slurm.NODE_STATE_MIXED

@property
def state(self):
cdef char* state = slurm_node_state_string_complete(
self.info.node_state)
cdef char* state = slurm_node_state_string_complete(self._node_state)
state_str = cstr.to_unicode(state)
xfree(state)
return state_str
Expand All @@ -685,9 +713,10 @@ cdef class Node:

@property
def next_state(self):
state = self._node_state
if ((self.info.next_state != slurm.NO_VAL)
and (self.info.node_state & slurm.NODE_STATE_REBOOT_REQUESTED
or self.info.node_state & slurm.NODE_STATE_REBOOT_ISSUED)):
and (state & slurm.NODE_STATE_REBOOT_REQUESTED
or state & slurm.NODE_STATE_REBOOT_ISSUED)):
return cstr.to_unicode(
slurm_node_state_string(self.info.next_state))
else:
Expand Down
21 changes: 11 additions & 10 deletions pyslurm/utils/cstr.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ cpdef dict to_dict(char *str_dict, str delim1=",", str delim2="="):
which can easily be converted to a dict.
"""
cdef:
str _str_dict = to_unicode(str_dict)
str _str_dict = to_unicode(str_dict)
str key, val
dict out = {}

Expand All @@ -143,7 +143,7 @@ cpdef dict to_dict(char *str_dict, str delim1=",", str delim2="="):
for kv in _str_dict.split(delim1):
if delim2 in kv:
key, val = kv.split(delim2, 1)
out[key] = val
out[key] = int(val) if val.isdigit() else val

return out

Expand Down Expand Up @@ -184,10 +184,10 @@ def dict_to_str(vals, prepend=None, delim1=",", delim2="="):

if isinstance(vals, str):
tmp_dict = validate_str_key_value_format(vals, delim1, delim2)

for k, v in tmp_dict.items():
if ((delim1 in str(k) or delim2 in str(k)) or
delim1 in str(v) or delim2 in str(v)):
delim1 in str(v) or delim2 in str(v)):
raise ValueError(
f"Key or Value cannot contain either {delim1} or {delim2}. "
f"Got Key: {k} and Value: {v}."
Expand All @@ -208,22 +208,23 @@ cpdef dict to_gres_dict(char *gres):
cdef:
dict output = {}
str gres_str = to_unicode(gres)
str gres_delim = "gres:"

if not gres_str or gres_str == "(null)":
return {}

for item in re.split(",(?=[^,]+?:)", gres_str):

# Remove the additional "gres" specifier if it exists
if "gres:" in item:
item = item.replace("gres:", "")
if gres_delim in item:
item = item.replace(gres_delim, "")

gres_splitted = re.split(
":(?=[^:]+?)",
":(?=[^:]+?)",
item.replace("(", ":", 1).replace(")", "")
)

name, typ, cnt = gres_splitted[0], gres_splitted[1], 0
name, typ, cnt = gres_splitted[0], gres_splitted[1], 0

# Check if we have a gres type.
if typ.isdigit():
Expand All @@ -243,10 +244,10 @@ cpdef dict to_gres_dict(char *gres):
# Cover cases with IDX
idx = gres_splitted[3] if not typ else gres_splitted[4]
output[name_and_typ] = {
"count": cnt,
"count": int(cnt),
"indexes": idx,
}

return output


Expand Down
1 change: 1 addition & 0 deletions pyslurm/utils/helpers.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,4 @@ from libc.stdlib cimport free

cpdef uid_to_name(uint32_t uid, err_on_invalid=*, dict lookup=*)
cpdef gid_to_name(uint32_t gid, err_on_invalid=*, dict lookup=*)
cpdef gres_from_tres_dict(dict tres_dict)
15 changes: 12 additions & 3 deletions pyslurm/utils/helpers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def user_to_uid(user, err_on_invalid=True):
try:
if isinstance(user, str) and not user.isdigit():
return getpwnam(user).pw_uid

return getpwuid(int(user)).pw_uid
except KeyError as e:
if err_on_invalid:
Expand Down Expand Up @@ -208,7 +208,7 @@ def nodelist_to_range_str(nodelist):
char *nl = nodelist
slurm.hostlist_t hl
char *hl_ranged = NULL

hl = slurm.slurm_hostlist_create(nl)
if not hl:
return None
Expand All @@ -219,7 +219,7 @@ def nodelist_to_range_str(nodelist):
free(hl_ranged)
slurm.slurm_hostlist_destroy(hl)

return out
return out


def humanize(num, decimals=1):
Expand Down Expand Up @@ -378,3 +378,12 @@ def dehumanize_step_id(sid):
return slurm.SLURM_PENDING_STEP
else:
return int(sid)


cpdef gres_from_tres_dict(dict tres_dict):
    """Extract the GRES entries from a TRES dictionary.

    Args:
        tres_dict (dict):
            Mapping of TRES names to their values, for example
            `{"cpu": 4, "gres/gpu": 2}`.

    Returns:
        (dict): Only the entries whose key starts with `gres/`, with that
            leading prefix removed from the key. Empty if `tres_dict` is
            None or empty.
    """
    gres_prefix = "gres/"
    if not tres_dict:
        return {}

    # Match on the prefix only: a substring test would also pick up keys
    # that merely contain "gres/", and str.replace would strip every
    # occurrence instead of just the leading one.
    prefix_len = len(gres_prefix)
    return {
        k[prefix_len:]: v
        for k, v in tres_dict.items()
        if k.startswith(gres_prefix)
    }
Loading