
Commit b01e178

Merge pull request #10 from danleifeng/mul_dims
support dynamic mf dims
2 parents f24c3e9 + e6cfbde commit b01e178

30 files changed: +1303 / -446 lines

paddle/fluid/framework/data_set.cc

Lines changed: 22 additions & 5 deletions
@@ -121,6 +121,24 @@ void DatasetImpl<T>::SetDataFeedDesc(const std::string& data_feed_desc_str) {
                                                 &data_feed_desc_);
 }
 
+template <typename T>
+std::vector<std::string> DatasetImpl<T>::GetSlots() {
+  auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
+  use_slots_.clear();
+  for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
+    const auto& slot = multi_slot_desc.slots(i);
+    if (slot.type() == "uint64" || slot.type() == "uint32") {
+      use_slots_.push_back(slot.name());
+    }
+  }
+  std::cout << "dataset use slots: ";
+  for (auto s : use_slots_) {
+    std::cout << s << " | ";
+  }
+  std::cout << " end " << std::endl;
+  return use_slots_;
+}
+
 template <typename T>
 void DatasetImpl<T>::SetChannelNum(int channel_num) {
   channel_num_ = channel_num;
@@ -303,12 +321,11 @@ static int compute_thread_batch_nccl(
   thread_avg_batch_num = static_cast<int>(offset.size() / thr_num);
 #ifdef PADDLE_WITH_GLOO
   auto gloo_wrapper = paddle::framework::GlooWrapper::GetInstance();
-  if (!gloo_wrapper->IsInitialized()) {
-    VLOG(0) << "GLOO is not inited";
-    gloo_wrapper->Init();
-  }
-
   if (gloo_wrapper->Size() > 1) {
+    if (!gloo_wrapper->IsInitialized()) {
+      VLOG(0) << "GLOO is not inited";
+      gloo_wrapper->Init();
+    }
     // adjust batch num per thread for NCCL
     std::vector<int> thread_avg_batch_num_vec(1, thread_avg_batch_num);
     std::vector<int64_t> total_instance_num_vec(1, total_instance_num);
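
The new GetSlots() walks the MultiSlotDesc of the data feed and keeps only the sparse slot names (type "uint64" or "uint32"); dense float slots are dropped. Below is a minimal sketch of the same filtering, using plain structs instead of the real MultiSlotDesc protobuf; the slot names and types are invented for illustration.

// Sketch only: mimics the GetSlots() filtering logic with plain structs
// instead of the real data_feed protobuf types. Slot names are invented.
#include <iostream>
#include <string>
#include <vector>

struct SlotDesc {
  std::string name;
  std::string type;  // "uint64", "uint32", or "float"
};

std::vector<std::string> FilterSparseSlots(const std::vector<SlotDesc>& slots) {
  std::vector<std::string> use_slots;
  for (const auto& slot : slots) {
    if (slot.type == "uint64" || slot.type == "uint32") {
      use_slots.push_back(slot.name);  // keep only sparse id slots
    }
  }
  return use_slots;
}

int main() {
  std::vector<SlotDesc> slots = {
      {"click", "float"}, {"6048", "uint64"}, {"6002", "uint64"}};
  for (const auto& s : FilterSparseSlots(slots)) std::cout << s << " | ";
  std::cout << " end" << std::endl;  // prints: 6048 | 6002 |  end
  return 0;
}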

paddle/fluid/framework/data_set.h

Lines changed: 5 additions & 1 deletion
@@ -152,7 +152,7 @@ class Dataset {
   virtual void DestroyPreLoadReaders() = 0;
   // set preload thread num
   virtual void SetPreLoadThreadNum(int thread_num) = 0;
-  // separate train thread and dataset thread
+  // seperate train thread and dataset thread
   virtual void DynamicAdjustChannelNum(int channel_num,
                                        bool discard_remaining_ins = false) = 0;
   virtual void DynamicAdjustReadersNum(int thread_num) = 0;
@@ -161,6 +161,8 @@ class Dataset {
   virtual void SetGraphDeviceKeys(
       const std::vector<int64_t>& h_device_keys) = 0;
 
+  virtual std::vector<std::string> GetSlots() = 0;
+
  protected:
   virtual int ReceiveFromClient(int msg_type, int client_id,
                                 const std::string& msg) = 0;
@@ -249,6 +251,7 @@ class DatasetImpl : public Dataset {
                                        bool discard_remaining_ins = false);
   virtual void DynamicAdjustReadersNum(int thread_num);
   virtual void SetFleetSendSleepSeconds(int seconds);
+  virtual std::vector<std::string> GetSlots();
   /* for enable_heterps_
   virtual void EnableHeterps(bool enable_heterps) {
     enable_heterps_ = enable_heterps;
@@ -324,6 +327,7 @@ class DatasetImpl : public Dataset {
   int64_t global_index_ = 0;
   std::vector<std::shared_ptr<ThreadPool>> consume_task_pool_;
   std::vector<T> input_records_;  // only for paddleboxdatafeed
+  std::vector<std::string> use_slots_;
   bool enable_heterps_ = false;
   int gpu_graph_mode_ = 1;
   std::vector<std::vector<int64_t>> gpu_graph_device_keys_;

paddle/fluid/framework/fleet/fleet_wrapper.cc

Lines changed: 5 additions & 4 deletions
@@ -69,7 +69,7 @@ void FleetWrapper::InitWorker(const std::string& dist_desc,
                               int node_num, int index) {
 #ifdef PADDLE_WITH_PSLIB
   if (!is_initialized_) {
-    VLOG(3) << "Going to init worker";
+    VLOG(0) << "Going to init worker";
     pslib_ptr_ = std::shared_ptr<paddle::distributed::PSlib>(
         new paddle::distributed::PSlib());
     pslib_ptr_->init_worker(dist_desc,
@@ -126,7 +126,7 @@ void FleetWrapper::GatherServers(const std::vector<uint64_t>& host_sign_list,
 
 void FleetWrapper::GatherClients(const std::vector<uint64_t>& host_sign_list) {
 #ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to gather client ips";
+  VLOG(0) << "Going to gather client ips";
   size_t len = host_sign_list.size();
   pslib_ptr_->gather_clients(const_cast<uint64_t*>(host_sign_list.data()), len);
 #endif
@@ -142,7 +142,7 @@ std::vector<uint64_t> FleetWrapper::GetClientsInfo() {
 
 void FleetWrapper::CreateClient2ClientConnection() {
 #ifdef PADDLE_WITH_PSLIB
-  VLOG(3) << "Going to create client2client connection";
+  VLOG(0) << "Going to create client2client connection";
   pslib_ptr_->create_client2client_connection(client2client_request_timeout_ms_,
                                               client2client_connect_timeout_ms_,
                                               client2client_max_retry_);
@@ -1054,7 +1054,8 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync(
   int slot_offset = 0;
   int grad_dim = 0;
   // don't worry, user do not have to care about all these flags
-  if (accesor == "DownpourCtrAccessor") {
+  if (accesor == "DownpourCtrAccessor" ||
+      accesor == "DownpourCtrDymfAccessor") {
     dump_slot = true;
     slot_offset = 1;
     grad_dim = fea_dim - 2;

paddle/fluid/framework/fleet/heter_context.h

Lines changed: 0 additions & 23 deletions
@@ -95,24 +95,6 @@ class HeterContext {
   }
   void SetShardNum(uint32_t shard_num) { shard_num_ = shard_num; }
   uint32_t ShardNum() { return shard_num_; }
-  void init(int shard_num, int device_num) {
-    shard_num_ = shard_num;
-    feature_keys_.resize(shard_num_);
-    value_ptr_.resize(shard_num_);
-    device_task_ptr_.resize(shard_num_);
-    device_task_keys_.resize(shard_num_);
-    for (size_t i = 0; i < device_task_ptr_.size(); i++) {
-      device_task_ptr_[i].resize(device_num);
-      device_task_keys_[i].resize(device_num);
-    }
-
-    device_values_.resize(device_num);
-    device_keys_.resize(device_num);
-    mutex_.resize(device_num);
-    for (size_t i = 0; i < mutex_.size(); ++i) {
-      mutex_[i] = new std::mutex();
-    }
-  }
 
   void init(int shard_num, int device_num, int dim_num) {
     shard_num_ = shard_num;
@@ -129,11 +111,6 @@ class HeterContext {
     for (size_t i = 0; i < feature_dim_keys_.size(); i++) {
       feature_dim_keys_[i].resize(dim_num);
       value_dim_ptr_[i].resize(dim_num);
-      if (i == 0) {
-        for (int j = 0; j < dim_num; j++) {
-          feature_dim_keys_[i][j].push_back(0);
-        }
-      }
     }
     device_values_.resize(device_num);
     device_dim_values_.resize(device_num);

paddle/fluid/framework/fleet/heter_ps/feature_value.h

Lines changed: 45 additions & 16 deletions
@@ -32,38 +32,67 @@ struct FeatureValue {
   float lr;
   float lr_g2sum;
   int mf_size;
-  float mf[MF_DIM + 1];
+  int mf_dim;
   uint64_t cpu_ptr;
+  float mf[0];
 
   friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) {
     out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot
-        << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:";
-    for (int i = 0; i < val.mf_size; ++i) {
+        << " lr: " << val.lr << " mf_dim: " << val.mf_dim
+        << "cpuptr: " << val.cpu_ptr << " mf_size: " << val.mf_size << " mf:";
+    for (int i = 0; i < val.mf_dim + 1; ++i) {
       out << " " << val.mf[i];
     }
     return out;
   }
+  __device__ __forceinline__ void operator=(const FeatureValue& in) {
+    delta_score = in.delta_score;
+    show = in.show;
+    clk = in.clk;
+    slot = in.slot;
+    lr = in.lr;
+    lr_g2sum = in.lr_g2sum;
+    mf_size = in.mf_size;
+    mf_dim = in.mf_dim;
+    cpu_ptr = in.cpu_ptr;
+    for (int i = 0; i < mf_dim + 1; i++) {
+      mf[i] = in.mf[i];
+    }
+  }
 };
 
 struct FeaturePushValue {
   float show;
   float clk;
   int slot;
   float lr_g;
-  float mf_g[MF_DIM];
+  int mf_dim;
+  float mf_g[0];
 
-  // __device__ __forceinline__ FeaturePushValue
-  // operator+(const FeaturePushValue& a) const {
-  //   FeaturePushValue out;
-  //   out.slot = a.slot;
-  //   out.show = a.show + show;
-  //   out.clk = a.clk + clk;
-  //   out.lr_g = a.lr_g + lr_g;
-  //   for (int i = 0; i < MF_DIM; ++i) {
-  //     out.mf_g[i] = a.mf_g[i] + mf_g[i];
-  //   }
-  //   return out;
-  // }
+  __device__ __forceinline__ FeaturePushValue
+  operator+(const FeaturePushValue& a) const {
+    FeaturePushValue out;
+    out.slot = a.slot;
+    out.mf_dim = a.mf_dim;
+    out.show = a.show + show;
+    out.clk = a.clk + clk;
+    out.lr_g = a.lr_g + lr_g;
+    // out.mf_g = a.mf_g;
+    for (int i = 0; i < out.mf_dim; ++i) {
+      out.mf_g[i] = a.mf_g[i] + mf_g[i];
+    }
+    return out;
+  }
+  __device__ __forceinline__ void operator=(const FeaturePushValue& in) {
+    show = in.show;
+    clk = in.clk;
+    slot = in.slot;
+    lr_g = in.lr_g;
+    mf_dim = in.mf_dim;
+    for (int i = 0; i < mf_dim; i++) {
+      mf_g[i] = in.mf_g[i];
+    }
+  }
 };
 
 }  // end namespace framework
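
Replacing the fixed float mf[MF_DIM + 1] array with an explicit mf_dim plus a zero-length trailing array (float mf[0]) is what makes the embedding width per-feature dynamic: a value no longer has a compile-time size, so each one occupies the fixed header plus (mf_dim + 1) floats inside a raw byte pool, and copies have to be done field by field (hence the explicit operator= overloads above). A minimal layout sketch, assuming the extra float slot holds the mf optimizer state; the struct and helper names are illustrative, not from the PR.

// Sketch only: how a variable-width feature value can be laid out in a raw
// byte pool. Struct and helper names are illustrative, not from the PR.
#include <cstddef>
#include <cstdint>
#include <vector>

struct FeatureValueSketch {
  float delta_score, show, clk;
  int slot;
  float lr, lr_g2sum;
  int mf_size;
  int mf_dim;        // per-feature embedding width
  uint64_t cpu_ptr;
  float mf[1];       // stands in for the zero-length trailing array mf[0]
};

// Bytes one value occupies: fixed header plus (mf_dim + 1) floats
// (the extra slot is assumed to hold the mf optimizer state).
size_t ValueBytes(int mf_dim) {
  return offsetof(FeatureValueSketch, mf) + (mf_dim + 1) * sizeof(float);
}

int main() {
  const int mf_dim = 64;
  const size_t stride = ValueBytes(mf_dim);
  std::vector<char> pool(10 * stride, 0);  // room for 10 values
  // Value i starts at pool.data() + i * stride; this stride plays the role of
  // the feature_value_size passed to the hashtable insert below.
  auto* v = reinterpret_cast<FeatureValueSketch*>(pool.data() + 3 * stride);
  v->mf_dim = mf_dim;
  return 0;
}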

paddle/fluid/framework/fleet/heter_ps/hashtable.h

Lines changed: 2 additions & 2 deletions
@@ -118,8 +118,8 @@ class HashTable {
               StreamType stream);
 
   template <typename StreamType>
-  void insert(const KeyType* d_keys, size_t len, char* pool, size_t start_index,
-              StreamType stream);
+  void insert(const KeyType* d_keys, size_t len, char* pool,
+              size_t feature_value_size, size_t start_index, StreamType stream);
 
   template <typename StreamType>
   void get(const KeyType* d_keys, ValType* d_vals, size_t len,

paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu

Lines changed: 47 additions & 13 deletions
@@ -50,15 +50,17 @@ __global__ void insert_kernel(Table* table,
 template <typename Table>
 __global__ void insert_kernel(Table* table,
                               const typename Table::key_type* const keys,
-                              size_t len, char* pool, int start_index) {
+                              size_t len, char* pool, size_t feature_value_size,
+                              int start_index) {
   ReplaceOp<typename Table::mapped_type> op;
   thrust::pair<typename Table::key_type, typename Table::mapped_type> kv;
 
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (i < len) {
     kv.first = keys[i];
-    kv.second = (Table::mapped_type)(pool + (start_index + i) * 80);
+    uint64_t offset = uint64_t(start_index + i) * feature_value_size;
+    kv.second = (Table::mapped_type)(pool + offset);
     auto it = table->insert(kv, op);
     assert(it != table->end() && "error: insert fails: table is full");
   }
@@ -81,14 +83,29 @@ __global__ void search_kernel(Table* table,
 template <typename Table>
 __global__ void dy_mf_search_kernel(Table* table,
                                     const typename Table::key_type* const keys,
-                                    char* const vals, size_t len,
+                                    char* vals, size_t len,
                                     size_t pull_feature_value_size) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  // return;
   if (i < len) {
     auto it = table->find(keys[i]);
 
     if (it != table->end()) {
-      *(FeatureValue*)(vals + i * pull_feature_value_size) = *(it->second);
+      uint64_t offset = i * pull_feature_value_size;
+      FeatureValue* cur = (FeatureValue*)(vals + offset);
+      FeatureValue& input = *(FeatureValue*)(it->second);
+      cur->slot = input.slot;
+      cur->show = input.show;
+      cur->clk = input.clk;
+      cur->mf_dim = input.mf_dim;
+      cur->lr = input.lr;
+      cur->mf_size = input.mf_size;
+      cur->cpu_ptr = input.cpu_ptr;
+      cur->delta_score = input.delta_score;
+      cur->lr_g2sum = input.lr_g2sum;
+      for (int j = 0; j < cur->mf_dim + 1; ++j) {
+        cur->mf[j] = input.mf[j];
+      }
     }
   }
 }
@@ -121,7 +138,7 @@ __global__ void dy_mf_update_kernel(Table* table,
       FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size);
       sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur);
     } else {
-      printf("yxf::push miss key: %d", keys[i]);
+      printf("warning: push miss key: %d", keys[i]);
     }
   }
 }
@@ -201,7 +218,8 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
 template <typename KeyType, typename ValType>
 template <typename StreamType>
 void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
-                                         char* pool, size_t start_index,
+                                         char* pool, size_t feature_value_size,
+                                         size_t start_index,
                                          StreamType stream) {
   if (len == 0) {
     return;
@@ -210,8 +228,8 @@ void HashTable<KeyType, ValType>::insert(const KeyType* d_keys, size_t len,
     return;
   }
   const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
-  insert_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(container_, d_keys, len,
-                                                       pool, start_index);
+  insert_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(
+      container_, d_keys, len, pool, feature_value_size, start_index);
 }
 
 template <typename KeyType, typename ValType>
@@ -319,10 +337,12 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
 }
 
 template class HashTable<unsigned long, paddle::framework::FeatureValue>;
+template class HashTable<unsigned long, paddle::framework::FeatureValue*>;
 template class HashTable<long, int>;
 template class HashTable<unsigned long, int>;
 template class HashTable<unsigned long, unsigned long>;
 template class HashTable<unsigned long, long>;
+template class HashTable<unsigned long, long*>;
 template class HashTable<long, long>;
 template class HashTable<long, unsigned long>;
 template class HashTable<long, unsigned int>;
@@ -332,6 +352,10 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
     paddle::framework::FeatureValue* d_vals, size_t len,
     cudaStream_t stream);
 
+template void
+HashTable<unsigned long, paddle::framework::FeatureValue*>::get<cudaStream_t>(
+    const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream);
+
 template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
                                                       int* d_vals, size_t len,
                                                       cudaStream_t stream);
@@ -357,6 +381,11 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
    const paddle::framework::FeatureValue* d_vals, size_t len,
    cudaStream_t stream);
 
+template void HashTable<unsigned long, paddle::framework::FeatureValue*>::
+    insert<cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool,
+                         size_t feature_value_size, size_t start_index,
+                         cudaStream_t stream);
+
 template void HashTable<long, int>::insert<cudaStream_t>(const long* d_keys,
                                                          const int* d_vals,
                                                          size_t len,
@@ -382,11 +411,6 @@ template void HashTable<long, unsigned int>::insert<cudaStream_t>(
     const long* d_keys, const unsigned int* d_vals, size_t len,
     cudaStream_t stream);
 
-// template void HashTable<unsigned long,
-// paddle::framework::FeatureValue>::insert<
-//     cudaStream_t>(const unsigned long* d_keys, size_t len, char* pool,
-//     size_t start_index, cudaStream_t stream);
-
 template void HashTable<unsigned long, paddle::framework::FeatureValue>::
     dump_to_cpu<cudaStream_t>(int devid, cudaStream_t stream);
 
@@ -401,6 +425,16 @@ template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
     sgd,
     cudaStream_t stream);
 
+template void
+HashTable<unsigned long, paddle::framework::FeatureValue*>::update<
+    Optimizer<paddle::framework::FeatureValue,
+              paddle::framework::FeaturePushValue>,
+    cudaStream_t>(const unsigned long* d_keys, const char* d_grads, size_t len,
+                  Optimizer<paddle::framework::FeatureValue,
+                            paddle::framework::FeaturePushValue>
+                      sgd,
+                  cudaStream_t stream);
+
 // template void HashTable<unsigned long,
 // paddle::framework::FeatureValue>::update<
 //     Optimizer<paddle::framework::FeatureValue,
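
The extra feature_value_size argument threaded through HashTable::insert and insert_kernel replaces the hard-coded 80-byte stride, so the byte offset of each value in the pool now follows the actual (variable) value footprint, presumably sized from the largest mf_dim in use. A minimal sketch of that offset arithmetic; the stride value below is made up for illustration.

// Sketch only: the pool-offset arithmetic used by insert_kernel, with an
// illustrative feature_value_size. Real code derives the size from mf_dim.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  size_t feature_value_size = 304;  // e.g. fixed header + (64 + 1) floats, made up
  size_t start_index = 1000;
  size_t i = 7;  // thread index within the batch
  // Old code used offset = (start_index + i) * 80, i.e. one fixed 80-byte slot per value.
  uint64_t offset = uint64_t(start_index + i) * feature_value_size;
  std::printf("value %zu lives at pool + %llu bytes\n", i,
              static_cast<unsigned long long>(offset));
  return 0;
}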
