Adds support for cancellation when closing a gRPC Session.

mrry · tensorflower-gardener · commit e876504fe2cd · 2016-05-16T12:52:08.000-07:00
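Steps that are still running when the session is closed are now cancelled
and fail with a Cancelled error. This brings the behavior of the gRPC
Session in line with the in-process session.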
Change: 122447672
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
@@ -162,6 +162,9 @@ class MasterSession : public MasterSessionInterface {
   // nodes) are unique across all sub-graphs within this session.
   int64 next_node_id_ GUARDED_BY(mu_) = 0;
 
+  // Owned by this session; used to cancel running steps on Close().
+  CancellationManager* cancellation_manager_;
+
   // Private dtor. The client must call Close().
   virtual ~MasterSession();
 
@@ -219,7 +222,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                        int64 execution_count,
                        SimpleGraphExecutionState* execution_state,
                        PerStepState* pss, CallOptions* opts,
-                       const RunStepRequest& req, RunStepResponse* resp);
+                       const RunStepRequest& req, RunStepResponse* resp,
+                       CancellationManager* cm);
 
   // Calls workers to cleanup states for the step "step_id".  Waits
   // till all cleanup rpcs complete.
@@ -504,7 +508,8 @@ class RunManyGraphs {
 Status MasterSession::ReffedClientGraph::RunPartitions(
     const MasterEnv* env, int64 step_id, int64 execution_count,
     SimpleGraphExecutionState* execution_state, PerStepState* pss,
-    CallOptions* call_opts, const RunStepRequest& req, RunStepResponse* resp) {
+    CallOptions* call_opts, const RunStepRequest& req, RunStepResponse* resp,
+    CancellationManager* cm) {
   VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
           << execution_count;
   // Builds an index for feeds provided by the client.
@@ -560,7 +565,14 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
 
   // Waits for the RunGraph calls.
   call_opts->SetCancelCallback([&calls]() { calls.StartCancel(); });
+  auto token = cm->get_cancellation_token();
+  bool success =
+      cm->RegisterCallback(token, [&calls]() { calls.StartCancel(); });
+  if (!success) {
+    return errors::Cancelled("Step was cancelled");
+  }
   calls.Wait();
+  cm->DeregisterCallback(token);
   call_opts->ClearCancelCallback();
 
   // Collects fetches.
@@ -696,7 +708,8 @@ MasterSession::MasterSession(const SessionOptions& opt, const MasterEnv* env,
       env_(env),
       handle_(strings::FpToString(random::New64())),
       graph_version_(0),
-      runs_(5) {
+      runs_(5),
+      cancellation_manager_(new CancellationManager) {
   UpdateLastAccessTime();
 
   swap(remote_devs_, *remote_devs);
@@ -717,6 +730,7 @@ MasterSession::MasterSession(const SessionOptions& opt, const MasterEnv* env,
 }
 
 MasterSession::~MasterSession() {
+  delete cancellation_manager_;
   for (const auto& iter : runs_) iter.second->Unref();
   for (const auto& iter : obsolete_) iter.second->Unref();
   delete flib_def_;
@@ -892,8 +906,9 @@ Status MasterSession::DoRunWithLocalExecution(CallOptions* opts,
   const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
   TRACEPRINTF("stepid %llu", step_id);
 
-  TF_RETURN_IF_ERROR(rcg->RunPartitions(
-      env_, step_id, count, execution_state_.get(), &pss, opts, *req, resp));
+  TF_RETURN_IF_ERROR(rcg->RunPartitions(env_, step_id, count,
+                                        execution_state_.get(), &pss, opts,
+                                        *req, resp, cancellation_manager_));
 
   pss.end_micros = Env::Default()->NowMicros();
 
@@ -914,6 +929,7 @@ Status MasterSession::DoRunWithLocalExecution(CallOptions* opts,
 }
 
 Status MasterSession::Close() {
+  cancellation_manager_->StartCancel();
   std::vector<ReffedClientGraph*> to_unref;
   {
     mutex_lock l(mu_);
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
@@ -17,6 +17,8 @@
 from __future__ import division
 from __future__ import print_function
 
+import time
+
 import numpy as np
 import tensorflow as tf
 
@@ -79,6 +81,27 @@ def testLargeFeed(self):
       self.assertEqual(0.5, min_val)
       self.assertEqual(0.5, max_val)
 
+  def testCloseCancelsBlockingOperation(self):
+    server = tf.train.Server.create_local_server()
+    sess = tf.Session(server.target)
+
+    q = tf.FIFOQueue(10, [tf.float32])
+    enqueue_op = q.enqueue(37.0)
+    dequeue_t = q.dequeue()
+
+    sess.run(enqueue_op)
+    sess.run(dequeue_t)
+
+    def blocking_dequeue():
+      with self.assertRaises(tf.errors.CancelledError):
+        sess.run(dequeue_t)
+
+    blocking_thread = self.checkedThread(blocking_dequeue)
+    blocking_thread.start()
+    time.sleep(0.5)
+    sess.close()
+    blocking_thread.join()
+
   def testInvalidHostname(self):
     with self.assertRaisesRegexp(tf.errors.InvalidArgumentError, "port"):
       _ = tf.train.Server({"local": ["localhost"]},