Skip to content

Commit

Permalink
Make Monitor remove dead Redis entries from exiting drivers. (#994)
Browse files Browse the repository at this point in the history
* WIP: removing OL, OI, TT on client exit; no saving yet.

* ray_redis_module.cc: update header comment.

* Cleanup: just the removal.

* Reformat via yapf: use pep8 style instead of google.

* Checkpoint addressing comments (partially)

* Add 'b' marker before strings (py3 compat)

* Add MonitorTest.

* Use `isort` to sort imports.

* Remove some loggings

* Fix flake8 noqa marker runtest.py

* Try to separate tests out to monitor_test.py

* Rework cleanup algorithm: correct logic

* Extend tests to cover multi-shard cases

* Add some small comments and formatting changes.
concretevitamin authored and robertnishihara committed Sep 26, 2017
1 parent 6e9657e commit 5a50e80
Showing 7 changed files with 698 additions and 174 deletions.
190 changes: 190 additions & 0 deletions .style.yapf
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
[style]
# Align closing bracket with visual indentation.
align_closing_bracket_with_visual_indent=True

# Allow dictionary keys to exist on multiple lines. For example:
#
# x = {
# ('this is the first element of a tuple',
# 'this is the second element of a tuple'):
# value,
# }
allow_multiline_dictionary_keys=False

# Allow lambdas to be formatted on more than one line.
allow_multiline_lambdas=False

# Insert a blank line before a class-level docstring.
blank_line_before_class_docstring=False

# Insert a blank line before a 'def' or 'class' immediately nested
# within another 'def' or 'class'. For example:
#
# class Foo:
# # <------ this blank line
# def method():
# ...
blank_line_before_nested_class_or_def=False

# Do not split consecutive brackets. Only relevant when
# dedent_closing_brackets is set. For example:
#
# call_func_that_takes_a_dict(
# {
# 'key1': 'value1',
# 'key2': 'value2',
# }
# )
#
# would reformat to:
#
# call_func_that_takes_a_dict({
# 'key1': 'value1',
# 'key2': 'value2',
# })
coalesce_brackets=False

# The column limit.
column_limit=79

# Indent width used for line continuations.
continuation_indent_width=4

# Put closing brackets on a separate line, dedented, if the bracketed
# expression can't fit in a single line. Applies to all kinds of brackets,
# including function definitions and calls. For example:
#
# config = {
# 'key1': 'value1',
# 'key2': 'value2',
# } # <--- this bracket is dedented and on a separate line
#
# time_series = self.remote_client.query_entity_counters(
# entity='dev3246.region1',
# key='dns.query_latency_tcp',
# transform=Transformation.AVERAGE(window=timedelta(seconds=60)),
# start_ts=now()-timedelta(days=3),
# end_ts=now(),
# ) # <--- this bracket is dedented and on a separate line
dedent_closing_brackets=False

# Place each dictionary entry onto its own line.
each_dict_entry_on_separate_line=True

# The regex for an i18n comment. The presence of this comment stops
# reformatting of that line, because the comments are required to be
# next to the string they translate.
i18n_comment=

# The i18n function call names. The presence of this function stops
# reformattting on that line, because the string it has cannot be moved
# away from the i18n comment.
i18n_function_call=

# Indent the dictionary value if it cannot fit on the same line as the
# dictionary key. For example:
#
# config = {
# 'key1':
# 'value1',
# 'key2': value1 +
# value2,
# }
indent_dictionary_value=False

# The number of columns to use for indentation.
indent_width=4

# Join short lines into one line. E.g., single line 'if' statements.
join_multiple_lines=True

# Do not include spaces around selected binary operators. For example:
#
# 1 + 2 * 3 - 4 / 5
#
# will be formatted as follows when configured with a value "*,/":
#
# 1 + 2*3 - 4/5
#
no_spaces_around_selected_binary_operators=set([])

# Use spaces around default or named assigns.
spaces_around_default_or_named_assign=False

# Use spaces around the power operator.
spaces_around_power_operator=False

# The number of spaces required before a trailing comment.
spaces_before_comment=2

# Insert a space between the ending comma and closing bracket of a list,
# etc.
space_between_ending_comma_and_closing_bracket=True

# Split before arguments if the argument list is terminated by a
# comma.
split_arguments_when_comma_terminated=False

# Set to True to prefer splitting before '&', '|' or '^' rather than
# after.
split_before_bitwise_operator=True

# Split before a dictionary or set generator (comp_for). For example, note
# the split before the 'for':
#
# foo = {
# variable: 'Hello world, have a nice day!'
# for variable in bar if variable != 42
# }
split_before_dict_set_generator=True

# If an argument / parameter list is going to be split, then split before
# the first argument.
split_before_first_argument=False

# Set to True to prefer splitting before 'and' or 'or' rather than
# after.
split_before_logical_operator=True

# Split named assignments onto individual lines.
split_before_named_assigns=True

# The penalty for splitting right after the opening bracket.
split_penalty_after_opening_bracket=30

# The penalty for splitting the line after a unary operator.
split_penalty_after_unary_operator=10000

# The penalty for splitting right before an if expression.
split_penalty_before_if_expr=0

# The penalty of splitting the line around the '&', '|', and '^'
# operators.
split_penalty_bitwise_operator=300

# The penalty for characters over the column limit.
split_penalty_excess_character=4500

# The penalty incurred by adding a line split to the unwrapped line. The
# more line splits added the higher the penalty.
split_penalty_for_added_line_split=30

# The penalty of splitting a list of "import as" names. For example:
#
# from a_very_long_or_indented_module_name_yada_yad import (long_argument_1,
# long_argument_2,
# long_argument_3)
#
# would reformat to something like:
#
# from a_very_long_or_indented_module_name_yada_yad import (
# long_argument_1, long_argument_2, long_argument_3)
split_penalty_import_names=0

# The penalty of splitting the line around the 'and' and 'or'
# operators.
split_penalty_logical_operator=300

# Use the Tab character for indentation.
use_tabs=False

1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -117,5 +117,6 @@ script:
- python test/component_failures_test.py
- python test/multi_node_test.py
- python test/recursion_test.py
- python test/monitor_test.py

- python -m pytest python/ray/rllib/test/test_catalog.py
8 changes: 7 additions & 1 deletion python/ray/experimental/state.py
Original file line number Diff line number Diff line change
@@ -52,12 +52,18 @@
class GlobalState(object):
"""A class used to interface with the Ray control state.
# TODO(zongheng): In the future move this to use Ray's redis module in the
# backend to cut down on # of request RPCs.
Attributes:
redis_client: The redis client used to query the redis server.
redis_client: The redis client used to query the redis server.
"""
def __init__(self):
"""Create a GlobalState object."""
# The redis server storing metadata, such as function table, client
# table, log files, event logs, workers/actions info.
self.redis_client = None
# A list of redis shards, storing the object table & task table.
self.redis_clients = None

def _check_connected(self):
221 changes: 182 additions & 39 deletions python/ray/monitor.py

Large diffs are not rendered by default.

35 changes: 19 additions & 16 deletions src/common/redis_module/ray_redis_module.cc
Original file line number Diff line number Diff line change
@@ -8,22 +8,25 @@

#include "common_protocol.h"

/**
* Various tables are maintained in redis:
*
* == OBJECT TABLE ==
*
* This consists of two parts:
* - The object location table, indexed by OL:object_id, which is the set of
* plasma manager indices that have access to the object.
* - The object info table, indexed by OI:object_id, which is a hashmap with key
* "hash" for the hash of the object and key "data_size" for the size of the
* object in bytes.
*
* == TASK TABLE ==
*
* TODO(pcm): Fill this out.
*/
// Various tables are maintained in redis:
//
// == OBJECT TABLE ==
//
// This consists of two parts:
// - The object location table, indexed by OL:object_id, which is the set of
// plasma manager indices that have access to the object.
// (In redis this is represented by a zset (sorted set).)
//
// - The object info table, indexed by OI:object_id, which is a hashmap of:
// "hash" -> the hash of the object,
// "data_size" -> the size of the object in bytes,
// "task" -> the task ID that generated this object.
// "is_put" -> 0 or 1.
//
// == TASK TABLE ==
//
// TODO(pcm): Fill this out.
//

#define OBJECT_INFO_PREFIX "OI:"
#define OBJECT_LOCATION_PREFIX "OL:"
93 changes: 93 additions & 0 deletions test/monitor_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import multiprocessing
import subprocess
import time
import unittest

import ray


class MonitorTest(unittest.TestCase):
def _testCleanupOnDriverExit(self, num_redis_shards):
stdout = subprocess.check_output([
"ray",
"start",
"--head",
"--num-redis-shards",
str(num_redis_shards),
]).decode("ascii")
lines = [m.strip() for m in stdout.split("\n")]
init_cmd = [m for m in lines if m.startswith("ray.init")]
self.assertEqual(1, len(init_cmd))
redis_address = init_cmd[0].split("redis_address=\"")[-1][:-2]

def StateSummary():
obj_tbl_len = len(ray.global_state.object_table())
task_tbl_len = len(ray.global_state.task_table())
func_tbl_len = len(ray.global_state.function_table())
return obj_tbl_len, task_tbl_len, func_tbl_len

def Driver(success):
success.value = True
# Start driver.
ray.init(redis_address=redis_address)
summary_start = StateSummary()
if (0, 1) != summary_start[:2]:
success.value = False

# Two new objects.
ray.get(ray.put(1111))
ray.get(ray.put(1111))
if (2, 1, summary_start[2]) != StateSummary():
success.value = False

@ray.remote
def f():
ray.put(1111) # Yet another object.
return 1111 # A returned object as well.

# 1 new function.
if (2, 1, summary_start[2] + 1) != StateSummary():
success.value = False

ray.get(f.remote())
if (4, 2, summary_start[2] + 1) != StateSummary():
success.value = False

ray.worker.cleanup()

success = multiprocessing.Value('b', False)
driver = multiprocessing.Process(target=Driver, args=(success, ))
driver.start()
# Wait for client to exit.
driver.join()
time.sleep(5)

# Just make sure Driver() is run and succeeded. Note(rkn), if the below
# assertion starts failing, then the issue may be that the summary
# values computed in the Driver function are being updated slowly and
# so the call to StateSummary() is getting outdated values. This could
# be fixed by looping until StateSummary() returns the desired values.
self.assertTrue(success.value)
# Check that objects, tasks, and functions are cleaned up.
ray.init(redis_address=redis_address)
# The assertion below can fail if the monitor is too slow to clean up
# the global state.
self.assertEqual((0, 1), StateSummary()[:2])

ray.worker.cleanup()
subprocess.Popen(["ray", "stop"]).wait()

def testCleanupOnDriverExitSingleRedisShard(self):
self._testCleanupOnDriverExit(num_redis_shards=1)

def testCleanupOnDriverExitManyRedisShards(self):
self._testCleanupOnDriverExit(num_redis_shards=5)
self._testCleanupOnDriverExit(num_redis_shards=31)


if __name__ == "__main__":
unittest.main(verbosity=2)
324 changes: 206 additions & 118 deletions test/runtest.py

Large diffs are not rendered by default.

0 comments on commit 5a50e80

Please sign in to comment.