Make Monitor remove dead Redis entries from exiting drivers. (#994)

* WIP: removing OL, OI, TT on client exit; no saving yet. * ray_redis_module.cc: update header comment. * Cleanup: just the removal. * Reformat via yapf: use pep8 style instead of google. * Checkpoint addressing comments (partially) * Add 'b' marker before strings (py3 compat) * Add MonitorTest. * Use `isort` to sort imports. * Remove some loggings * Fix flake8 noqa marker runtest.py * Try to separate tests out to monitor_test.py * Rework cleanup algorithm: correct logic * Extend tests to cover multi-shard cases * Add some small comments and formatting changes.
ray-project · Sep 26, 2017 · 5a50e80 · 5a50e80
1 parent 6e9657e
commit 5a50e80
Showing 7 changed files with 698 additions and 174 deletions.
diff --git a/.style.yapf b/.style.yapf
@@ -0,0 +1,190 @@
+[style]
+# Align closing bracket with visual indentation.
+align_closing_bracket_with_visual_indent=True
+
+# Allow dictionary keys to exist on multiple lines. For example:
+#
+#   x = {
+#       ('this is the first element of a tuple',
+#        'this is the second element of a tuple'):
+#            value,
+#   }
+allow_multiline_dictionary_keys=False
+
+# Allow lambdas to be formatted on more than one line.
+allow_multiline_lambdas=False
+
+# Insert a blank line before a class-level docstring.
+blank_line_before_class_docstring=False
+
+# Insert a blank line before a 'def' or 'class' immediately nested
+# within another 'def' or 'class'. For example:
+#
+#   class Foo:
+#                      # <------ this blank line
+#     def method():
+#       ...
+blank_line_before_nested_class_or_def=False
+
+# Do not split consecutive brackets. Only relevant when
+# dedent_closing_brackets is set. For example:
+#
+#    call_func_that_takes_a_dict(
+#        {
+#            'key1': 'value1',
+#            'key2': 'value2',
+#        }
+#    )
+#
+# would reformat to:
+#
+#    call_func_that_takes_a_dict({
+#        'key1': 'value1',
+#        'key2': 'value2',
+#    })
+coalesce_brackets=False
+
+# The column limit.
+column_limit=79
+
+# Indent width used for line continuations.
+continuation_indent_width=4
+
+# Put closing brackets on a separate line, dedented, if the bracketed
+# expression can't fit in a single line. Applies to all kinds of brackets,
+# including function definitions and calls. For example:
+#
+#   config = {
+#       'key1': 'value1',
+#       'key2': 'value2',
+#   }        # <--- this bracket is dedented and on a separate line
+#
+#   time_series = self.remote_client.query_entity_counters(
+#       entity='dev3246.region1',
+#       key='dns.query_latency_tcp',
+#       transform=Transformation.AVERAGE(window=timedelta(seconds=60)),
+#       start_ts=now()-timedelta(days=3),
+#       end_ts=now(),
+#   )        # <--- this bracket is dedented and on a separate line
+dedent_closing_brackets=False
+
+# Place each dictionary entry onto its own line.
+each_dict_entry_on_separate_line=True
+
+# The regex for an i18n comment. The presence of this comment stops
+# reformatting of that line, because the comments are required to be
+# next to the string they translate.
+i18n_comment=
+
+# The i18n function call names. The presence of this function stops
+# reformattting on that line, because the string it has cannot be moved
+# away from the i18n comment.
+i18n_function_call=
+
+# Indent the dictionary value if it cannot fit on the same line as the
+# dictionary key. For example:
+#
+#   config = {
+#       'key1':
+#           'value1',
+#       'key2': value1 +
+#               value2,
+#   }
+indent_dictionary_value=False
+
+# The number of columns to use for indentation.
+indent_width=4
+
+# Join short lines into one line. E.g., single line 'if' statements.
+join_multiple_lines=True
+
+# Do not include spaces around selected binary operators. For example:
+#
+#   1 + 2 * 3 - 4 / 5
+#
+# will be formatted as follows when configured with a value "*,/":
+#
+#   1 + 2*3 - 4/5
+#
+no_spaces_around_selected_binary_operators=set([])
+
+# Use spaces around default or named assigns.
+spaces_around_default_or_named_assign=False
+
+# Use spaces around the power operator.
+spaces_around_power_operator=False
+
+# The number of spaces required before a trailing comment.
+spaces_before_comment=2
+
+# Insert a space between the ending comma and closing bracket of a list,
+# etc.
+space_between_ending_comma_and_closing_bracket=True
+
+# Split before arguments if the argument list is terminated by a
+# comma.
+split_arguments_when_comma_terminated=False
+
+# Set to True to prefer splitting before '&', '|' or '^' rather than
+# after.
+split_before_bitwise_operator=True
+
+# Split before a dictionary or set generator (comp_for). For example, note
+# the split before the 'for':
+#
+#   foo = {
+#       variable: 'Hello world, have a nice day!'
+#       for variable in bar if variable != 42
+#   }
+split_before_dict_set_generator=True
+
+# If an argument / parameter list is going to be split, then split before
+# the first argument.
+split_before_first_argument=False
+
+# Set to True to prefer splitting before 'and' or 'or' rather than
+# after.
+split_before_logical_operator=True
+
+# Split named assignments onto individual lines.
+split_before_named_assigns=True
+
+# The penalty for splitting right after the opening bracket.
+split_penalty_after_opening_bracket=30
+
+# The penalty for splitting the line after a unary operator.
+split_penalty_after_unary_operator=10000
+
+# The penalty for splitting right before an if expression.
+split_penalty_before_if_expr=0
+
+# The penalty of splitting the line around the '&', '|', and '^'
+# operators.
+split_penalty_bitwise_operator=300
+
+# The penalty for characters over the column limit.
+split_penalty_excess_character=4500
+
+# The penalty incurred by adding a line split to the unwrapped line. The
+# more line splits added the higher the penalty.
+split_penalty_for_added_line_split=30
+
+# The penalty of splitting a list of "import as" names. For example:
+#
+#   from a_very_long_or_indented_module_name_yada_yad import (long_argument_1,
+#                                                             long_argument_2,
+#                                                             long_argument_3)
+#
+# would reformat to something like:
+#
+#   from a_very_long_or_indented_module_name_yada_yad import (
+#       long_argument_1, long_argument_2, long_argument_3)
+split_penalty_import_names=0
+
+# The penalty of splitting the line around the 'and' and 'or'
+# operators.
+split_penalty_logical_operator=300
+
+# Use the Tab character for indentation.
+use_tabs=False
+
diff --git a/.travis.yml b/.travis.yml
@@ -117,5 +117,6 @@ script:
   - python test/component_failures_test.py
   - python test/multi_node_test.py
   - python test/recursion_test.py
+  - python test/monitor_test.py
 
   - python -m pytest python/ray/rllib/test/test_catalog.py
diff --git a/python/ray/experimental/state.py b/python/ray/experimental/state.py
@@ -52,12 +52,18 @@
 class GlobalState(object):
     """A class used to interface with the Ray control state.
 
+    # TODO(zongheng): In the future move this to use Ray's redis module in the
+    # backend to cut down on # of request RPCs.
+
     Attributes:
-      redis_client: The redis client used to query the redis server.
+        redis_client: The redis client used to query the redis server.
     """
     def __init__(self):
         """Create a GlobalState object."""
+        # The redis server storing metadata, such as function table, client
+        # table, log files, event logs, workers/actions info.
         self.redis_client = None
+        # A list of redis shards, storing the object table & task table.
         self.redis_clients = None
 
     def _check_connected(self):

diff --git a/python/ray/monitor.py b/python/ray/monitor.py
diff --git a/src/common/redis_module/ray_redis_module.cc b/src/common/redis_module/ray_redis_module.cc
@@ -8,22 +8,25 @@
 
 #include "common_protocol.h"
 
-/**
- * Various tables are maintained in redis:
- *
- * == OBJECT TABLE ==
- *
- * This consists of two parts:
- * - The object location table, indexed by OL:object_id, which is the set of
- *   plasma manager indices that have access to the object.
- * - The object info table, indexed by OI:object_id, which is a hashmap with key
- *   "hash" for the hash of the object and key "data_size" for the size of the
- *   object in bytes.
- *
- * == TASK TABLE ==
- *
- * TODO(pcm): Fill this out.
- */
+// Various tables are maintained in redis:
+//
+// == OBJECT TABLE ==
+//
+// This consists of two parts:
+// - The object location table, indexed by OL:object_id, which is the set of
+//   plasma manager indices that have access to the object.
+//   (In redis this is represented by a zset (sorted set).)
+//
+// - The object info table, indexed by OI:object_id, which is a hashmap of:
+//     "hash" -> the hash of the object,
+//     "data_size" -> the size of the object in bytes,
+//     "task" -> the task ID that generated this object.
+//     "is_put" -> 0 or 1.
+//
+// == TASK TABLE ==
+//
+// TODO(pcm): Fill this out.
+//
 
 #define OBJECT_INFO_PREFIX "OI:"
 #define OBJECT_LOCATION_PREFIX "OL:"

diff --git a/test/monitor_test.py b/test/monitor_test.py
@@ -0,0 +1,93 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import multiprocessing
+import subprocess
+import time
+import unittest
+
+import ray
+
+
+class MonitorTest(unittest.TestCase):
+    def _testCleanupOnDriverExit(self, num_redis_shards):
+        stdout = subprocess.check_output([
+            "ray",
+            "start",
+            "--head",
+            "--num-redis-shards",
+            str(num_redis_shards),
+        ]).decode("ascii")
+        lines = [m.strip() for m in stdout.split("\n")]
+        init_cmd = [m for m in lines if m.startswith("ray.init")]
+        self.assertEqual(1, len(init_cmd))
+        redis_address = init_cmd[0].split("redis_address=\"")[-1][:-2]
+
+        def StateSummary():
+            obj_tbl_len = len(ray.global_state.object_table())
+            task_tbl_len = len(ray.global_state.task_table())
+            func_tbl_len = len(ray.global_state.function_table())
+            return obj_tbl_len, task_tbl_len, func_tbl_len
+
+        def Driver(success):
+            success.value = True
+            # Start driver.
+            ray.init(redis_address=redis_address)
+            summary_start = StateSummary()
+            if (0, 1) != summary_start[:2]:
+                success.value = False
+
+            # Two new objects.
+            ray.get(ray.put(1111))
+            ray.get(ray.put(1111))
+            if (2, 1, summary_start[2]) != StateSummary():
+                success.value = False
+
+            @ray.remote
+            def f():
+                ray.put(1111)  # Yet another object.
+                return 1111  # A returned object as well.
+
+            # 1 new function.
+            if (2, 1, summary_start[2] + 1) != StateSummary():
+                success.value = False
+
+            ray.get(f.remote())
+            if (4, 2, summary_start[2] + 1) != StateSummary():
+                success.value = False
+
+            ray.worker.cleanup()
+
+        success = multiprocessing.Value('b', False)
+        driver = multiprocessing.Process(target=Driver, args=(success, ))
+        driver.start()
+        # Wait for client to exit.
+        driver.join()
+        time.sleep(5)
+
+        # Just make sure Driver() is run and succeeded. Note(rkn), if the below
+        # assertion starts failing, then the issue may be that the summary
+        # values computed in the Driver function are being updated slowly and
+        # so the call to StateSummary() is getting outdated values. This could
+        # be fixed by looping until StateSummary() returns the desired values.
+        self.assertTrue(success.value)
+        # Check that objects, tasks, and functions are cleaned up.
+        ray.init(redis_address=redis_address)
+        # The assertion below can fail if the monitor is too slow to clean up
+        # the global state.
+        self.assertEqual((0, 1), StateSummary()[:2])
+
+        ray.worker.cleanup()
+        subprocess.Popen(["ray", "stop"]).wait()
+
+    def testCleanupOnDriverExitSingleRedisShard(self):
+        self._testCleanupOnDriverExit(num_redis_shards=1)
+
+    def testCleanupOnDriverExitManyRedisShards(self):
+        self._testCleanupOnDriverExit(num_redis_shards=5)
+        self._testCleanupOnDriverExit(num_redis_shards=31)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/test/runtest.py b/test/runtest.py