Commit 0f4b6d6

Merge branch 'release/2.0' into rel/2.0-op-version-2
2 parents 580746f + e7cbc43

177 files changed: +6082 additions, -1461 deletions (only a subset of the changed files is shown below)


paddle/fluid/distributed/communicator_common.h

Lines changed: 7 additions & 2 deletions
```diff
@@ -30,7 +30,8 @@ struct CommContext {
               const std::vector<int64_t> &sections,
               const std::vector<std::string> &origin_names, int id,
               bool merge_add_ = true, bool is_sparse_ = true,
-              bool is_distributed_ = false, int table_id_ = -1)
+              bool is_distributed_ = false, int table_id_ = -1,
+              bool is_tensor_table_ = false)
       : var_name(name),
         splited_varnames(names),
         epmap(emap),
@@ -40,7 +41,8 @@ struct CommContext {
         merge_add(merge_add_),
         is_sparse(is_sparse_),
         is_distributed(is_distributed_),
-        table_id(table_id_) {}
+        table_id(table_id_),
+        is_tensor_table(is_tensor_table_) {}

   CommContext(const CommContext &ctx) {
     var_name = ctx.var_name;
@@ -53,6 +55,7 @@ struct CommContext {
     origin_varnames = ctx.origin_varnames;
     is_distributed = ctx.is_distributed;
     table_id = ctx.table_id;
+    is_tensor_table = ctx.is_tensor_table;
   }

   std::string print() const {
@@ -75,6 +78,7 @@ struct CommContext {
     ss << " is_sparse: " << is_sparse;
     ss << " is_distributed: " << is_distributed << "\n";
     ss << " table_id: " << table_id << "\n";
+    ss << " is_tensor_table: " << is_tensor_table << "\n";

     return ss.str();
   }
@@ -89,6 +93,7 @@ struct CommContext {
   bool is_sparse;
   bool is_distributed;
   int table_id;
+  bool is_tensor_table;
 };

 }  // namespace distributed
```
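The flag rides along as a defaulted trailing constructor argument, so existing call sites compile unchanged. Below is a hypothetical construction sketch for a step-counter context; the three leading parameters (name, split names, endpoint map) are inferred from the initializer list above, and every value is a placeholder, not something taken from this commit.

```cpp
#include "paddle/fluid/distributed/communicator_common.h"

// Hypothetical sketch: a CommContext for the step-counter send path with the
// new trailing flag set. All values below are placeholders.
paddle::distributed::CommContext MakeStepCounterContext() {
  return paddle::distributed::CommContext(
      "@PS_STEP_COUNTER@",    // name -> var_name
      {"@PS_STEP_COUNTER@"},  // names -> splited_varnames
      {"127.0.0.1:8000"},     // emap -> endpoint map (placeholder endpoint)
      {1},                    // sections
      {"@PS_STEP_COUNTER@"},  // origin_names
      0,                      // id
      true,                   // merge_add_
      false,                  // is_sparse_
      false,                  // is_distributed_
      2,                      // table_id_ (placeholder)
      true);                  // is_tensor_table_, the new flag
}
```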

paddle/fluid/distributed/fleet.cc

Lines changed: 5 additions & 4 deletions
```diff
@@ -53,15 +53,16 @@ void FleetWrapper::LoadSparseOnServer(const std::string& path,
   pserver_ptr_->_server_ptr->table(table_id)->load(path, meta);
 }

-void FleetWrapper::InitServer(const std::string& dist_desc,
-                              const std::vector<std::string>& host_sign_list,
-                              int index) {
+void FleetWrapper::InitServer(
+    const std::string& dist_desc,
+    const std::vector<std::string>& host_sign_list, int index,
+    const std::vector<framework::ProgramDesc>& server_sub_program) {
   if (!is_initialized_) {
     VLOG(3) << "Going to init server";
     pserver_ptr_ = std::shared_ptr<paddle::distributed::PSCore>(
         new paddle::distributed::PSCore());
     pserver_ptr_->init_server(dist_desc, &host_sign_list, host_sign_list.size(),
-                              index);
+                              index, server_sub_program);
     is_initialized_ = true;
   } else {
     VLOG(3) << "Server can be initialized only once";
```

paddle/fluid/distributed/fleet.h

Lines changed: 4 additions & 2 deletions
```diff
@@ -154,8 +154,10 @@ class FleetWrapper {
   // init server
   // void InitServer(const std::string& dist_desc,
   //                 const std::vector<uint64_t>& host_sign_list, int index);
-  void InitServer(const std::string& dist_desc,
-                  const std::vector<std::string>& host_sign_list, int index);
+  void InitServer(
+      const std::string& dist_desc,
+      const std::vector<std::string>& host_sign_list, int index,
+      const std::vector<framework::ProgramDesc>& server_sub_program = {});
   // init trainer
   void InitWorker(const std::string& dist_desc,
                   const std::vector<std::string>& host_sign_list, Scope* scope,
```
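A minimal usage sketch, assuming dist_desc already holds the parameter-server description and host_sign_list is collected; server_sub_program is the new hook that hands ProgramDescs (e.g. the tensor table's startup and main programs) through to init_server. Direct construction of FleetWrapper here is for brevity only.

```cpp
#include "paddle/fluid/distributed/fleet.h"

// Sketch: wiring the new server_sub_program argument through InitServer.
void InitServerSketch(const std::string &dist_desc,
                      const std::vector<std::string> &host_sign_list) {
  paddle::distributed::FleetWrapper fleet;
  // Placeholder: e.g. the tensor table's startup and main ProgramDescs.
  std::vector<paddle::framework::ProgramDesc> server_sub_program(2);
  fleet.InitServer(dist_desc, host_sign_list, /*index=*/0, server_sub_program);
  // Old call sites stay source-compatible because the parameter defaults to {}:
  fleet.InitServer(dist_desc, host_sign_list, /*index=*/0);
}
```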

paddle/fluid/distributed/ps.proto

Lines changed: 5 additions & 6 deletions
```diff
@@ -126,12 +126,11 @@ message TableAccessorParameter {
 }

 message TensorAccessorParameter {
-  optional string tensor_class = 1;
-  optional uint32 fea_dim = 2;
-  optional uint32 emb_dim = 3;
-  optional string param = 4;
-  optional string grad = 5;
-  optional string common_block_map = 6;
+  optional string feed_var_name = 1;
+  optional string fetch_var_name = 2;
+  optional int64 startup_program_id = 3;
+  optional int64 main_program_id = 4;
+  optional string tensor_table_class = 6;
 }

 message CommonAccessorParameter {
```
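Note the field numbering: tag 5 is left unused and tensor_table_class keeps tag 6, the slot of the old common_block_map string. A hedged C++ sketch of filling the reworked message, relying only on the setters protoc generates for optional fields; the variable names, program ids, and table class string are placeholders, not values taken from this commit.

```cpp
#include "paddle/fluid/distributed/ps.pb.h"  // generated header; path assumed

// Sketch: populating the new TensorAccessorParameter fields.
void FillTensorAccessorSketch(
    paddle::distributed::TensorAccessorParameter *tensor_param) {
  tensor_param->set_feed_var_name("@PS_STEP_COUNTER@");  // placeholder
  tensor_param->set_fetch_var_name("learning_rate");     // placeholder
  tensor_param->set_startup_program_id(0);
  tensor_param->set_main_program_id(1);
  tensor_param->set_tensor_table_class("GlobalStepTable");  // assumed name
}
```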

paddle/fluid/distributed/service/brpc_ps_client.cc

Lines changed: 28 additions & 0 deletions
```diff
@@ -719,6 +719,34 @@ std::future<int32_t> BrpcPsClient::push_dense_raw_gradient(
   return fut;
 }

+std::future<int32_t> BrpcPsClient::push_global_step(int table_id,
+                                                    int64_t *total_send_data,
+                                                    void *done) {
+  size_t request_call_num = _server_channels.size();
+  DownpourBrpcClosure *closure = reinterpret_cast<DownpourBrpcClosure *>(done);
+  auto promise = std::make_shared<std::promise<int32_t>>();
+  closure->add_promise(promise);
+  std::future<int> fut = promise->get_future();
+  for (size_t i = 0; i < request_call_num; ++i) {
+    closure->request(i)->set_cmd_id(PS_PUSH_GLOBAL_STEP);
+    closure->request(i)->set_table_id(table_id);
+    closure->request(i)->set_client_id(_client_id);
+    auto *push_data = closure->request(i)->mutable_data();
+    push_data->clear();
+    int32_t num_per_shard = 1;
+    push_data->resize(sizeof(uint32_t) + num_per_shard * sizeof(int64_t));
+    char *push_data_ptr = const_cast<char *>(push_data->data());
+    memcpy(push_data_ptr, &num_per_shard, sizeof(uint32_t));
+    memcpy(push_data_ptr + sizeof(uint32_t), total_send_data,
+           num_per_shard * sizeof(int64_t));
+
+    PsService_Stub rpc_stub(get_dense_channel(i));
+    rpc_stub.service(closure->cntl(i), closure->request(i),
+                     closure->response(i), closure);
+  }
+  return fut;
+}
+
 std::future<int32_t> BrpcPsClient::pull_sparse(float **select_values,
                                                size_t table_id,
                                                const uint64_t *keys,
```
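The request payload is a flat, length-prefixed buffer: a uint32 count followed by that many int64 values. A self-contained sketch of decoding it, mirroring what the server side reads back (a hypothetical helper, not part of the commit):

```cpp
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Decodes [uint32 num | int64 values[num]] as written by push_global_step.
std::vector<int64_t> DecodeGlobalStepPayload(const std::string &data) {
  uint32_t num = 0;
  std::memcpy(&num, data.data(), sizeof(uint32_t));
  std::vector<int64_t> values(num);
  std::memcpy(values.data(), data.data() + sizeof(uint32_t),
              num * sizeof(int64_t));
  return values;
}
```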

paddle/fluid/distributed/service/brpc_ps_client.h

Lines changed: 3 additions & 1 deletion
```diff
@@ -140,7 +140,9 @@ class BrpcPsClient : public PSClient {
                                   std::vector<float> *values,
                                   std::vector<uint64_t> *keys,
                                   int pserver_idx);
-
+  virtual std::future<int32_t> push_global_step(int table_id,
+                                                int64_t *total_send_data,
+                                                void *done);
   virtual std::future<int32_t> flush();

   virtual std::future<int32_t> send_client2client_msg(
```

paddle/fluid/distributed/service/brpc_ps_server.cc

Lines changed: 22 additions & 0 deletions
```diff
@@ -100,6 +100,7 @@ int32_t PsService::initialize() {
   _service_handler_map[PS_BARRIER] = &PsService::barrier;
   _service_handler_map[PS_START_PROFILER] = &PsService::start_profiler;
   _service_handler_map[PS_STOP_PROFILER] = &PsService::stop_profiler;
+  _service_handler_map[PS_PUSH_GLOBAL_STEP] = &PsService::push_global_step;

   // shard initialization: the shard info of server_list can be read from env only after the server starts
   initialize_shard_info();
@@ -526,5 +527,26 @@ int32_t PsService::start_profiler(Table *table, const PsRequestMessage &request,
   return 0;
 }

+int32_t PsService::push_global_step(Table *table,
+                                    const PsRequestMessage &request,
+                                    PsResponseMessage &response,
+                                    brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response);
+  auto req_buffer_size = request.data().size();
+  if (req_buffer_size < 1) {
+    set_response_code(response, 0, "run_program data is empty");
+    return 0;
+  }
+  uint32_t num = *(const uint32_t *)(request.data().data());
+  const int64_t *values =
+      (const int64_t *)(request.data().data() + sizeof(uint32_t));
+  auto trainer_id = request.client_id();
+  if (table->push_dense(values, trainer_id) != 0) {
+    set_response_code(response, -1, "run_program failed");
+  }
+
+  return 0;
+}
+
 }  // namespace distributed
 }  // namespace paddle
```
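The one-line registration in _service_handler_map is what routes the new command id to the handler; the service loop itself is not part of this diff. A self-contained analogue of that member-function-pointer dispatch pattern (illustrative names, not the PsService implementation):

```cpp
#include <cstdint>
#include <unordered_map>

// Analogue of the handler-map dispatch used above: command ids map to member
// function pointers, so adding a service is one map entry plus one method.
struct MiniService {
  using Handler = int32_t (MiniService::*)();
  std::unordered_map<int32_t, Handler> handlers;

  int32_t push_global_step() { return 0; }

  int32_t dispatch(int32_t cmd_id) {
    auto it = handlers.find(cmd_id);
    if (it == handlers.end()) return -1;  // unknown command
    return (this->*(it->second))();
  }
};

// usage: MiniService s; s.handlers[1002] = &MiniService::push_global_step;
```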

paddle/fluid/distributed/service/brpc_ps_server.h

Lines changed: 3 additions & 0 deletions
```diff
@@ -110,6 +110,9 @@ class PsService : public PsBaseService {
   int32_t print_table_stat(Table *table, const PsRequestMessage &request,
                            PsResponseMessage &response, brpc::Controller *cntl);

+  int32_t push_global_step(Table *table, const PsRequestMessage &request,
+                           PsResponseMessage &response, brpc::Controller *cntl);
+
   bool _is_initialize_shard_info;
   std::mutex _initialize_shard_mutex;
   std::unordered_map<int32_t, serviceHandlerFunc> _service_handler_map;
```

paddle/fluid/distributed/service/communicator.cc

Lines changed: 53 additions & 3 deletions
```diff
@@ -34,6 +34,9 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"

+#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@"
+#define STEP_COUNTER "@PS_STEP_COUNTER@"
+
 namespace paddle {
 namespace distributed {

@@ -377,6 +380,37 @@ void Communicator::RpcProfilerControl() {
   }
 }

+void Communicator::SendGlobalStep(const CommContext &ctx, int batches,
+                                  Scope *send_scope) {
+  if (batches == 0) {
+    return;
+  }
+  auto &table_id = ctx.table_id;
+  size_t request_call_num = _worker_ptr->get_server_nums();
+
+  auto &var_name = STEP_COUNTER;
+  auto *out_var = send_scope->Var(var_name);
+  auto *out_t = out_var->GetMutable<framework::LoDTensor>();
+  auto *data = out_t->mutable_data<int64_t>({1}, platform::CPUPlace());
+  data[0] = static_cast<int64_t>(batches);
+  VLOG(3) << "Communicator::SendGlobalStep send: " << batches;
+  DownpourBrpcClosure *closure = new DownpourBrpcClosure(
+      request_call_num, [this, request_call_num](void *done) {
+        int ret = 0;
+        auto *closure = (DownpourBrpcClosure *)done;
+        for (size_t i = 0; i < request_call_num; ++i) {
+          if (closure->check_response(i, PS_PUSH_GLOBAL_STEP) != 0) {
+            ret = -1;
+            break;
+          }
+        }
+        closure->set_promise_value(ret);
+      });
+  auto status = _worker_ptr->push_global_step(table_id, data, closure);
+  status.wait();
+  return;
+}
+
 void AsyncCommunicator::RecvThread() {
   if (!independent_recv_) return;
   VLOG(3) << "Independent RecvThread Start and Wait";
@@ -465,10 +499,16 @@ void AsyncCommunicator::SendByCommunicator() {

   for (size_t i = 0; i < var_nums; i++) {
     auto &var_name = varnames[i];
-    MergeVars<float>(var_name, vars[i], send_scope_.get(), 1);
+    if (var_name == STEP_COUNTER) {
+      MergeVars<int64_t>(var_name, vars[i], send_scope_.get(), 1);
+    } else {
+      MergeVars<float>(var_name, vars[i], send_scope_.get(), 1);
+    }
   }

-  if (ctx.is_sparse) {
+  if (ctx.is_tensor_table) {
+    SendGlobalStep(ctx, merged_var_num, send_scope_.get());
+  } else if (ctx.is_sparse) {
     PADDLE_ENFORCE_EQ(
         varnames.size(), 1,
         platform::errors::InvalidArgument(
@@ -599,8 +639,18 @@ bool AsyncCommunicator::Check(const std::vector<std::string> &var_tables) {
       platform::errors::InvalidArgument("var_tables.size() == 1 is permitted"));

   auto table_name = var_tables[0];
-  if (send_varname_to_ctx_.find(table_name) == send_varname_to_ctx_.end())
+  if (send_varname_to_ctx_.find(table_name) == send_varname_to_ctx_.end()) {
     return false;
+  }
+  if (table_name == STEP_COUNTER) {
+    VLOG(3) << "send step_counter into queue";
+    auto tmp_var = std::make_shared<Variable>();
+    auto *tensor = tmp_var->GetMutable<framework::LoDTensor>();
+    tensor->Resize(framework::make_ddim({1}));
+    auto *out_d = tensor->mutable_data<int64_t>(platform::CPUPlace());
+    out_d[0] = 1;
+    send_varname_to_queue_[table_name]->Push(tmp_var);
+  }
   return true;
 }
```
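Each Check() call enqueues a {1} tensor for STEP_COUNTER; SendByCommunicator merges the queued tensors with MergeVars<int64_t> and hands the batch count (merged_var_num) to SendGlobalStep, which pushes a single int64 to the server. A self-contained stand-in for that merge-add semantics (illustrative only, not the MergeVars implementation):

```cpp
#include <cstdint>
#include <vector>

// Illustrative merge-add over queued step-counter values.
int64_t MergeStepCounters(const std::vector<int64_t> &queued) {
  int64_t total = 0;
  for (int64_t v : queued) total += v;
  return total;
}
// e.g. three queued batches: MergeStepCounters({1, 1, 1}) == 3,
// which then travels to the server via push_global_step.
```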

paddle/fluid/distributed/service/communicator.h

Lines changed: 3 additions & 4 deletions
```diff
@@ -223,6 +223,9 @@ class Communicator {
   // 6. recv sparse param
   virtual void RpcRecvSparse(const std::string &varname, int table_id,
                              Scope *scope);
+  // 7. send global step
+  virtual void SendGlobalStep(const CommContext &ctx, int batches,
+                              Scope *send_scope);

   virtual ~Communicator() {}
   virtual void RpcProfilerControl();
@@ -376,8 +379,6 @@ class AsyncCommunicator : public Communicator {

   virtual void SendByCommunicator();

-  virtual void SendGlobalStep(int batches) {}
-
   virtual void RecvByCommunicator();

   virtual void RecvNoBarrier();
@@ -527,8 +528,6 @@ class GeoCommunicator : public AsyncCommunicator {

   void SendByCommunicator() { return; }

-  void SendGlobalStep(int batches) override { return; }
-
   void RecvByCommunicator() override { return; }

   inline std::string GradToParam(const std::string var_name) {
```
