
Commit 0b14ae4

hawkinsp authored and tensorflower-gardener committed

[XLA] Remove LocalClient::ExecuteLocally() in favor of LocalClient::Compile() and LocalExecutable::Run().

Change: 149482633
1 parent b69dd29, commit 0b14ae4

13 files changed: +127, -361 lines
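
For callers of the removed API, the replacement flow is to compile the Computation once with LocalClient::Compile() and then invoke the returned LocalExecutable via Run(). The sketch below is illustrative only and is not part of this commit: the RunComputation helper, the argument_layout_ptrs/allocator/build_options names, and the ExecutableBuildOptions type for Compile()'s third parameter are assumptions; TF_ASSIGN_OR_RETURN is the usual XLA status macro.

// Hypothetical caller-side sketch of the new flow; not from this commit.
// Only LocalClient::Compile(), LocalExecutable::Run(), and
// ExecutableRunOptions::set_allocator() are taken from the local-client API.
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/executable_run_options.h"
#include "tensorflow/compiler/xla/status_macros.h"

xla::StatusOr<std::unique_ptr<xla::ShapedBuffer>> RunComputation(
    xla::LocalClient* client, const xla::Computation& computation,
    tensorflow::gtl::ArraySlice<const xla::Shape*> argument_layout_ptrs,
    tensorflow::gtl::ArraySlice<const xla::ShapedBuffer*> arguments,
    xla::DeviceMemoryAllocator* allocator) {
  // Previously (removed by this commit):
  //   return client->ExecuteLocally(computation, arguments, local_options);

  // Now: compile once against the argument layouts, then run the executable.
  xla::ExecutableBuildOptions build_options;  // assumed options type
  TF_ASSIGN_OR_RETURN(
      std::unique_ptr<xla::LocalExecutable> executable,
      client->Compile(computation, argument_layout_ptrs, build_options));

  xla::ExecutableRunOptions run_options;
  run_options.set_allocator(allocator);  // Run() now requires an allocator
  return executable->Run(arguments, run_options);
}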

tensorflow/compiler/jit/kernels/xla_local_launch_op.h

Lines changed: 3 additions & 2 deletions
@@ -31,8 +31,9 @@ namespace tensorflow {
 // Once all inputs are present, and their shapes are known, the op can
 // use a 'XlaCompilationCache' to compile and execute code which is specific
 // to the shapes of input Tensors.
-// XlaLocalLaunchOp uses xla::LocalClient::ExecuteLocally and passes
-// arguments into/out of XLA in device memory.
+// XlaLocalLaunchOp uses xla::LocalClient::Compile() and
+// xla::LocalExecutable::Run(), and passes arguments into/out of XLA in device
+// memory.
 class XlaLocalLaunchOp : public OpKernel {
  public:
   explicit XlaLocalLaunchOp(OpKernelConstruction* ctx);

tensorflow/compiler/xla/client/local_client.cc

Lines changed: 23 additions & 12 deletions
@@ -87,7 +87,7 @@ LocalExecutable::LocalExecutable(std::unique_ptr<Executable> executable,
 
 tensorflow::Status LocalExecutable::ValidateExecutionOptions(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutableRunOptions& options) {
+    const ExecutableRunOptions& options, const Backend& backend) {
   const ComputationLayout& computation_layout =
       executable_->module_config().entry_computation_layout();
 
@@ -156,13 +156,24 @@ tensorflow::Status LocalExecutable::ValidateExecutionOptions(
         run_executor->GetDeviceDescription().name().c_str());
   }
 
+  if (!options.allocator()) {
+    return InvalidArgument("an allocator must be provided to ExecuteLocally");
+  }
+
+  if (options.allocator()->platform() != backend.platform()) {
+    return InvalidArgument(
+        "allocator platform (%s) does not match service platform (%s)",
+        options.allocator()->platform()->Name().c_str(),
+        backend.platform()->Name().c_str());
+  }
+
   return tensorflow::Status::OK();
 }
 
 StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::Run(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
     const ExecutableRunOptions& options) {
-  TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options));
+  TF_RETURN_IF_ERROR(ValidateExecutionOptions(arguments, options, *backend_));
 
   ExecutableRunOptions actual_options = options;
   Backend::StreamPtr stream;
@@ -180,8 +191,16 @@ StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::Run(
   if (executable_->dumping()) {
     return ExecuteAndDump(&service_options, arguments);
   }
-  return executable_->ExecuteOnStream(&service_options, arguments,
-                                      /*hlo_execution_profile=*/nullptr);
+  return Service::ExecuteOnStreamWrapper<
+      StatusOr<std::unique_ptr<ShapedBuffer>>>(
+      executable_.get(), &service_options, options.execution_profile(),
+      backend_,
+      [&arguments](Executable* executable,
+                   const ServiceExecutableRunOptions* run_options,
+                   HloExecutionProfile* hlo_execution_profile) {
+        return executable->ExecuteOnStream(run_options, arguments,
+                                           hlo_execution_profile);
+      });
 }
 
 StatusOr<std::unique_ptr<ShapedBuffer>> LocalExecutable::ExecuteAndDump(
@@ -242,14 +261,6 @@ tensorflow::Status LocalClient::ResolveArguments(
       argument_ptrs);
 }
 
-StatusOr<std::unique_ptr<ShapedBuffer>> LocalClient::ExecuteLocally(
-    const Computation& computation,
-    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const LocalExecuteOptions& options) {
-  return local_service_->ExecuteLocally(computation.handle(), arguments,
-                                        options);
-}
-
 StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 LocalClient::CompileAheadOfTime(
     const tensorflow::gtl::ArraySlice<AheadOfTimeComputationInstance>
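
Note the two checks added to ValidateExecutionOptions() above: Run() now fails unless the ExecutableRunOptions carry an allocator, and that allocator's platform must match the backend's platform. A minimal, hedged sketch of satisfying them (the allocator variable is assumed to be any DeviceMemoryAllocator created for the same platform as the client, for example the backend's own allocator; executable and arguments as in the earlier sketch):

// Sketch only; mirrors the checks added to ValidateExecutionOptions().
xla::ExecutableRunOptions run_options;
run_options.set_allocator(allocator);  // omitted -> "an allocator must be provided..."
// If allocator->platform() differs from the backend platform, Run() now returns
// InvalidArgument("allocator platform (...) does not match service platform (...)").
auto result_or = executable->Run(arguments, run_options);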

tensorflow/compiler/xla/client/local_client.h

Lines changed: 1 addition & 20 deletions
@@ -111,7 +111,7 @@ class LocalExecutable {
   // of the computation.
   tensorflow::Status ValidateExecutionOptions(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutableRunOptions& options);
+      const ExecutableRunOptions& options, const Backend& backend);
 
   // Records the computation in a SessionModule proto with the arguments used to
   // invoke it, and the result. Enabled by flag: --tla_dump_executions_to.
@@ -175,25 +175,6 @@ class LocalClient : public Client {
       const Shape& shape, int device_ordinal,
       bool allocate_space_for_deep_copy);
 
-  // Executes the given computation with the given arguments and
-  // options. Arguments and result are "zero-copy", and are passed as pointers
-  // to device memory. See LocalExecuteOptions class comments for description of
-  // available options. The returned ShapedBuffer includes pointer(s) to device
-  // memory (DeviceMemoryBase) which are the caller's responsibility to
-  // deallocate. The layout of the result is chosen by the XLA service and
-  // should not be relied upon to be a specific value. If a specific result
-  // layout is needed, then the layout should be set in options.
-  //
-  // The arrays of arguments with different shapes or layouts are assumed not to
-  // alias.
-  //
-  // TODO(b/31220873): Remove ExecuteLocally methods. The path forward is to use
-  // Compile and run the returned LocalExecutable.
-  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteLocally(
-      const Computation& computation,
-      const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const LocalExecuteOptions& options);
-
   // Build and return a LocalExecutable object. The executable is compiled using
   // the given argument layouts and options.
   StatusOr<std::unique_ptr<LocalExecutable>> Compile(

tensorflow/compiler/xla/executable_run_options.cc

Lines changed: 10 additions & 0 deletions
@@ -67,4 +67,14 @@ const Eigen::ThreadPoolDevice* ExecutableRunOptions::intra_op_thread_pool()
   return intra_op_thread_pool_;
 }
 
+ExecutableRunOptions& ExecutableRunOptions::set_execution_profile(
+    ExecutionProfile* profile) {
+  execution_profile_ = profile;
+  return *this;
+}
+
+ExecutionProfile* ExecutableRunOptions::execution_profile() const {
+  return execution_profile_;
+}
+
 }  // namespace xla

tensorflow/compiler/xla/executable_run_options.h

Lines changed: 6 additions & 0 deletions
@@ -40,6 +40,7 @@ struct ThreadPoolDevice;
 namespace xla {
 
 class DeviceMemoryAllocator;
+class ExecutionProfile;
 
 // Class containing options for running a LocalExecutable.
 class ExecutableRunOptions {
@@ -74,12 +75,17 @@ class ExecutableRunOptions {
       const Eigen::ThreadPoolDevice* intra_op_thread_pool);
   const Eigen::ThreadPoolDevice* intra_op_thread_pool() const;
 
+  // If set, profiling information is written to 'profile'.
+  ExecutionProfile* execution_profile() const;
+  ExecutableRunOptions& set_execution_profile(ExecutionProfile* profile);
+
  private:
  DeviceMemoryAllocator* allocator_ = nullptr;
  int device_ordinal_ = -1;
  perftools::gputools::Stream* stream_ = nullptr;
  tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
  const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
+  ExecutionProfile* execution_profile_ = nullptr;
 };
 
 }  // namespace xla
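
With the new accessor wired through Service::ExecuteOnStreamWrapper in local_client.cc, a caller can ask Run() to fill in an ExecutionProfile. A hedged sketch, assuming ExecutionProfile is the xla.proto message exposing compute_time_ns() and that the ExecutableRunOptions setters chain as the header's return types suggest:

// Sketch only; executable, arguments, and allocator as in the earlier sketches.
xla::ExecutionProfile profile;
xla::ExecutableRunOptions run_options;
run_options.set_allocator(allocator).set_execution_profile(&profile);
TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ShapedBuffer> result,
                    executable->Run(arguments, run_options));
// After Run() returns, ExecuteOnStreamWrapper has populated `profile`.
LOG(INFO) << "compute time (ns): " << profile.compute_time_ns();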
