-
Notifications
You must be signed in to change notification settings - Fork 6.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[core][usability] Disambiguate ObjectLostErrors for better understandability #18292
Changes from 12 commits
6cdcca8
50d761f
e8d5f21
8916315
d7d5b4c
8dd540d
2c06021
16b1af5
506d1fb
bd79ca6
ed861be
5a5fbf1
f493524
33e0187
1c0922c
af61dab
1b214da
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -186,8 +186,10 @@ cdef RayObjectsToDataMetadataPairs( | |
cdef VectorToObjectRefs(const c_vector[CObjectReference] &object_refs): | ||
result = [] | ||
for i in range(object_refs.size()): | ||
result.append(ObjectRef(object_refs[i].object_id(), | ||
object_refs[i].call_site())) | ||
result.append(ObjectRef( | ||
object_refs[i].object_id(), | ||
object_refs[i].owner_address().SerializeAsString(), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this add extra overheads? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This call takes about 1-2us, total |
||
object_refs[i].call_site())) | ||
return result | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -3,7 +3,9 @@ | |||||
|
||||||
import ray.cloudpickle as pickle | ||||||
from ray.core.generated.common_pb2 import RayException, Language, PYTHON | ||||||
from ray.core.generated.common_pb2 import Address | ||||||
import ray.ray_constants as ray_constants | ||||||
from ray._raylet import WorkerID | ||||||
import colorama | ||||||
import setproctitle | ||||||
|
||||||
|
@@ -171,7 +173,7 @@ def __str__(self): | |||||
# due to the dependency failure. | ||||||
# Print out an user-friendly | ||||||
# message to explain that.. | ||||||
out.append(" Some of the input arguments for " | ||||||
out.append(" At least one of the input arguments for " | ||||||
"this task could not be computed:") | ||||||
if i + 1 < len(lines) and lines[i + 1].startswith(" "): | ||||||
# If the next line is indented with 2 space, | ||||||
|
@@ -279,32 +281,110 @@ def __str__(self): | |||||
"to list active objects in the cluster.") | ||||||
|
||||||
|
||||||
class ObjectLostError(RayError): | ||||||
"""Indicates that an object has been lost due to node failure. | ||||||
# TODO(XXX): Replace with ObjectLostError for backwards compatibility once all | ||||||
# Python tests pass. | ||||||
class ObjectUnreachableError(RayError): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll switch this back later, just putting this here for now so that I can see which Python tests would fail right now. |
||||||
"""Generic error for objects that are unreachable due to node failure or | ||||||
system error. | ||||||
|
||||||
Attributes: | ||||||
object_ref_hex: Hex ID of the object. | ||||||
""" | ||||||
|
||||||
def __init__(self, object_ref_hex, call_site): | ||||||
def __init__(self, object_ref_hex, owner_address, call_site): | ||||||
self.object_ref_hex = object_ref_hex | ||||||
self.owner_address = owner_address | ||||||
self.call_site = call_site.replace( | ||||||
ray_constants.CALL_STACK_LINE_DELIMITER, "\n ") | ||||||
|
||||||
def __str__(self): | ||||||
msg = (f"Object {self.object_ref_hex} cannot be retrieved due to node " | ||||||
"failure or system error.") | ||||||
msg = f"Failed to retrieve object {self.object_ref_hex}. " | ||||||
if self.call_site: | ||||||
msg += (f" The ObjectRef was created at: {self.call_site}") | ||||||
msg += (f"The ObjectRef was created at: {self.call_site}") | ||||||
else: | ||||||
msg += ( | ||||||
" To see information about where this ObjectRef was created " | ||||||
"To see information about where this ObjectRef was created " | ||||||
"in Python, set the environment variable " | ||||||
"RAY_record_ref_creation_sites=1 during `ray start` and " | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We also need to set this for driver. Why don't we write a doc and make a link instead? (I think it is confusing for users anyway when they just read it) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a doc, but left it out of the error message for now. I think it's best if we try to make the error messages standalone. |
||||||
"`ray.init()`.") | ||||||
return msg | ||||||
|
||||||
|
||||||
class ObjectLostError(ObjectUnreachableError): | ||||||
"""Indicates that the object is lost from distributed memory, due to | ||||||
node failure or system error. | ||||||
|
||||||
Attributes: | ||||||
object_ref_hex: Hex ID of the object. | ||||||
""" | ||||||
|
||||||
def __str__(self): | ||||||
return super().__str__() + "\n\n" + ( | ||||||
f"All copies of {self.object_ref_hex} have been lost due to node " | ||||||
"failure. Check cluster logs (`/tmp/ray/session_latest/logs`) for " | ||||||
"more information about the failure.") | ||||||
|
||||||
|
||||||
class ObjectDeletedError(ObjectUnreachableError): | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change to assertion failure class? |
||||||
"""Indicates that an object has been deleted while there was still a | ||||||
reference to it. | ||||||
|
||||||
Attributes: | ||||||
object_ref_hex: Hex ID of the object. | ||||||
""" | ||||||
|
||||||
def __str__(self): | ||||||
return super().__str__() + "\n\n" + ( | ||||||
"The object has already been deleted by the reference counting " | ||||||
"protocol. This should not happen.") | ||||||
|
||||||
|
||||||
class OwnerDiedError(ObjectUnreachableError): | ||||||
"""Indicates that the owner of the object has died while there is still a | ||||||
reference to the object. | ||||||
|
||||||
Attributes: | ||||||
object_ref_hex: Hex ID of the object. | ||||||
""" | ||||||
|
||||||
def __str__(self): | ||||||
log_loc = "`/tmp/ray/session_latest/logs`" | ||||||
if self.owner_address: | ||||||
try: | ||||||
addr = Address() | ||||||
addr.ParseFromString(self.owner_address) | ||||||
ip_addr = addr.ip_address | ||||||
worker_id = WorkerID(addr.worker_id) | ||||||
log_loc = ( | ||||||
f"`/tmp/ray/session_latest/logs/*{worker_id.hex()}*`" | ||||||
f" at IP address {ip_addr}") | ||||||
except Exception: | ||||||
# Catch all to make sure we always at least print the default | ||||||
# message. | ||||||
pass | ||||||
|
||||||
return super().__str__() + "\n\n" + ( | ||||||
"The object's owner has exited. This is the Python " | ||||||
"worker that first created the ObjectRef via `.remote()` or " | ||||||
"`ray.put()`. " | ||||||
f"Check cluster logs ({log_loc}) for more " | ||||||
"information about the Python worker failure.") | ||||||
|
||||||
|
||||||
class ObjectReconstructionFailedError(ObjectUnreachableError): | ||||||
"""Indicates that the owner of the object has died while there is still a | ||||||
reference to the object. | ||||||
|
||||||
Attributes: | ||||||
object_ref_hex: Hex ID of the object. | ||||||
""" | ||||||
|
||||||
def __str__(self): | ||||||
return super().__str__() + "\n\n" + ( | ||||||
f"The object cannot be reconstructed " | ||||||
"because the maximum number of task retries has been exceeded.") | ||||||
|
||||||
|
||||||
class GetTimeoutError(RayError): | ||||||
"""Indicates that a call to the worker timed out.""" | ||||||
pass | ||||||
|
@@ -338,6 +418,9 @@ def __str__(self): | |||||
RayActorError, | ||||||
ObjectStoreFullError, | ||||||
ObjectLostError, | ||||||
ObjectDeletedError, | ||||||
ObjectReconstructionFailedError, | ||||||
OwnerDiedError, | ||||||
GetTimeoutError, | ||||||
AsyncioActorExit, | ||||||
RuntimeEnvSetupError, | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about we link to a github issue here? I don't think we should document known bugs in the docs.