
Commit 07a66be

xcpherzmxdream authored and committed
Fix async memory alloc bug & reuse scope variable memory (PaddlePaddle#59)
* fix async alloc bug
* use stream safe alloc
* alloc fix & reuse scope mem
1 parent 58ddc23 commit 07a66be

File tree

7 files changed: +114 lines, -14 lines


paddle/fluid/framework/data_feed.cc

Lines changed: 8 additions & 4 deletions
@@ -2745,15 +2745,17 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num, MiniBatchG
         h_tensor_ptrs[j] = float_tensor.data<float>() + float_offset;
         float_offset += total_instance;
       } else {
-        h_tensor_ptrs[j] = pack->float_tensor_vec()[float_zero_slot_index].mutable_data<float>({total_instance, 1}, this->place_);
+        h_tensor_ptrs[j] = pack->float_tensor_vec()[float_zero_slot_index].mutable_data<float>({total_instance, 1},
+                                                                                               this->place_);
         float_zero_slot_index++;
       }
     } else if (info.type[0] == 'u') {  // uint64
       if (total_instance > 0) {
         h_tensor_ptrs[j] = uint64_tensor.data<int64_t>() + uint64_offset;
         uint64_offset += total_instance;
       } else {
-        h_tensor_ptrs[j] = pack->uint64_tensor_vec()[uint64_zero_slot_index].mutable_data<int64_t>({total_instance, 1}, this->place_);
+        h_tensor_ptrs[j] = pack->uint64_tensor_vec()[uint64_zero_slot_index].mutable_data<int64_t>({total_instance, 1},
+                                                                                                   this->place_);
         uint64_zero_slot_index++;
       }
     }
@@ -2869,10 +2871,12 @@ MiniBatchGpuPack* SlotRecordInMemoryDataFeed::get_pack(MiniBatchGpuPack* last_pa


 MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place,
-                                   const std::vector<UsedSlotInfo>& infos) {
+                                   const std::vector<UsedSlotInfo>& infos,
+                                   phi::StreamId stream_id) {
   place_ = place;
   stream_holder_.reset(new platform::stream::CUDAStream(place));
   stream_ = stream_holder_->raw_stream();
+  alloc_stream_id_ = stream_id;

   ins_num_ = 0;
   pv_num_ = 0;
@@ -2892,7 +2896,7 @@ MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place,
   }
   copy_host2device(&gpu_slots_, gpu_used_slots_.data(), gpu_used_slots_.size());

-  slot_buf_ptr_ = memory::AllocShared(place_, used_slot_size_ * sizeof(void*));
+  slot_buf_ptr_ = memory::AllocShared(place_, used_slot_size_ * sizeof(void*), phi_stream());

   int device_id = place_.GetDeviceId();
   VLOG(3) << "begin get batch pack device id: " << device_id;
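
The substantive change in this file is that the pack's device buffers are now allocated through memory::AllocShared with an explicit phi::Stream, so the allocations are owned by a dedicated allocation stream rather than the default stream. A minimal sketch of that call pattern, assuming a CUDA build; the helper name and include paths are illustrative, not part of this commit:

    #include "paddle/fluid/memory/malloc.h"
    #include "paddle/fluid/platform/place.h"
    #include "paddle/phi/core/stream.h"

    // Sketch: allocate a device buffer that is tracked against an explicit
    // stream id (the stream-safe allocator path used by this commit).
    std::shared_ptr<phi::Allocation> AllocOnStream(
        const paddle::platform::CUDAPlace& place,
        phi::StreamId alloc_stream_id,
        size_t num_slots) {
      // Wrapping the raw id in phi::Stream tells the allocator which stream
      // owns the memory, so the free is synchronized against that stream only.
      return paddle::memory::AllocShared(place,
                                         num_slots * sizeof(void*),
                                         phi::Stream(alloc_stream_id));
    }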

paddle/fluid/framework/data_feed.h

Lines changed: 24 additions & 6 deletions
@@ -549,7 +549,8 @@ struct BatchGPUValue {
 class MiniBatchGpuPack {
  public:
   MiniBatchGpuPack(const paddle::platform::Place& place,
-                   const std::vector<UsedSlotInfo>& infos);
+                   const std::vector<UsedSlotInfo>& infos,
+                   phi::StreamId stream_id);
   ~MiniBatchGpuPack();

   bool is_use();
@@ -570,14 +571,14 @@ class MiniBatchGpuPack {
     if (used_float_num_ > 0) {
       int float_total_len = buf_.h_float_lens.back();
       if (float_total_len > 0) {
-        float_tensor_.mutable_data<float>({float_total_len, 1}, this->place_);
+        float_tensor_.mutable_data<float>({float_total_len, 1}, this->place_, phi_stream());
       }
     }
     if (used_uint64_num_ > 0) {
       int uint64_total_len = buf_.h_uint64_lens.back();
       if (uint64_total_len > 0) {
         uint64_tensor_.mutable_data<int64_t>({uint64_total_len, 1},
-                                             this->place_);
+                                             this->place_, phi_stream());
       }
     }
   }
@@ -595,9 +596,9 @@ class MiniBatchGpuPack {

   void resize_gpu_slot_offsets(const size_t slot_total_bytes) {
     if (gpu_slot_offsets_ == nullptr) {
-      gpu_slot_offsets_ = memory::AllocShared(place_, slot_total_bytes);
+      gpu_slot_offsets_ = memory::AllocShared(place_, slot_total_bytes, phi_stream());
     } else if (gpu_slot_offsets_->size() < slot_total_bytes) {
-      auto buf = memory::AllocShared(place_, slot_total_bytes);
+      auto buf = memory::AllocShared(place_, slot_total_bytes, phi_stream());
       gpu_slot_offsets_.swap(buf);
       buf = nullptr;
     }
@@ -613,6 +614,11 @@ class MiniBatchGpuPack {
     return stream_;
   }

+  // only for interface compatibility
+  phi::Stream phi_stream() {
+    return phi::Stream(alloc_stream_id_);
+  }
+
  private:
   void transfer_to_gpu(void);
   void pack_all_data(const SlotRecord* ins_vec, int num);
@@ -666,6 +672,8 @@ class MiniBatchGpuPack {

   std::shared_ptr<phi::Allocation> gpu_slot_offsets_ = nullptr;
   std::shared_ptr<phi::Allocation> slot_buf_ptr_ = nullptr;
+
+  phi::StreamId alloc_stream_id_ {0};
 };
 class MiniBatchGpuPackMgr {
   static const int MAX_DEIVCE_NUM = 16;
@@ -700,14 +708,24 @@ class MiniBatchGpuPackMgr {
         return pack_list_[device_id][i];
       }
     }
-    auto* pack = new MiniBatchGpuPack(place, infos);
+    {
+      std::lock_guard<std::mutex> lock(mutex_);
+      if (!alloc_stream_map_.count(device_id)) {
+        alloc_stream_map_.emplace(device_id, new platform::stream::CUDAStream(place));
+      }
+    }
+    phi::StreamId alloc_stream_id =
+        reinterpret_cast<phi::StreamId>(alloc_stream_map_[device_id]->raw_stream());
+    auto* pack = new MiniBatchGpuPack(place, infos, alloc_stream_id);
     pack->set_use_flag(true);
     pack_list_[device_id].push_back(pack);
     return pack;
   }

  private:
   std::vector<std::vector<MiniBatchGpuPack*>> pack_list_;
+  std::unordered_map<int, std::unique_ptr<platform::stream::CUDAStream>> alloc_stream_map_;
+  std::mutex mutex_;
 };
 // global mgr
 inline MiniBatchGpuPackMgr& BatchGpuPackMgr() {
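
The manager now keeps one long-lived allocation stream per device, created lazily under a mutex, and passes its raw handle down as a phi::StreamId so every pack on that device allocates on the same stream. A standalone sketch of that lock-guarded, lazily populated per-device map, using stand-in types rather than Paddle's platform::stream::CUDAStream:

    #include <cstdint>
    #include <memory>
    #include <mutex>
    #include <unordered_map>

    // Stand-in for a per-device stream; in the commit this is
    // platform::stream::CUDAStream and the id comes from raw_stream().
    struct DeviceStream {
      explicit DeviceStream(int device) : id(0x1000 + device) {}
      uint64_t id;
    };

    class PerDeviceStreamMap {
     public:
      // Returns the id of the single stream associated with device_id,
      // creating it on first use. The lock covers both lookup and insert.
      uint64_t GetOrCreate(int device_id) {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = streams_.find(device_id);
        if (it == streams_.end()) {
          it = streams_.emplace(device_id,
                                std::make_unique<DeviceStream>(device_id)).first;
        }
        return it->second->id;
      }

     private:
      std::unordered_map<int, std::unique_ptr<DeviceStream>> streams_;
      std::mutex mutex_;
    };

In the diff itself the read of alloc_stream_map_[device_id] happens after the lock is released; the sketch keeps the whole lookup inside the critical section for simplicity.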

paddle/fluid/framework/device_worker.h

Lines changed: 1 addition & 0 deletions
@@ -643,6 +643,7 @@ class PSGPUWorker : public HogwildWorker {
   // async infershape
   int task_threads_num_ {1};
   int scope_num_ {task_threads_num_ + 1};
+  // int scope_num_ {1};
   std::atomic<int> thread_count_ {0};
   std::atomic<bool> stop_token_ {false};
   std::atomic<bool> pack_is_end_ {false};

paddle/fluid/framework/ps_gpu_worker.cc

Lines changed: 62 additions & 2 deletions
@@ -58,6 +58,41 @@ void PSGPUWorker::CreateDeviceResource(const ProgramDesc& main_prog) {
     for (auto& op : ops_) {
       op->SetIsRuntimeInferShape(true);
     }
+
+    // reusing memory
+    auto input_names = device_reader_->GetInputVarNames();
+    std::set<std::string> input_names_set(input_names.begin(), input_names.end());
+    for (auto& scope : thread_scope_vec_) {
+      std::vector<Variable*> need_reuse;
+      for (auto& var : block.AllVars()) {
+        std::string name = var->Name();
+        if (!var->Persistable()) {
+          if (input_names_set.find(var->Name()) != input_names_set.end()) {
+            continue;
+          }
+          auto* ptr = scope->FindLocalVar(var->Name());
+          PADDLE_ENFORCE_NE(ptr, nullptr,
+              phi::errors::NotFound("The var %s is not found.", var->Name()));
+          need_reuse.push_back(ptr);
+        }
+      }
+      need_reuse_var_vec_[scope] = std::move(need_reuse);
+    }
+    {
+      need_reuse_var_.clear();
+      for (auto& var : block.AllVars()) {
+        std::string name = var->Name();
+        if (!var->Persistable()) {
+          if (input_names_set.find(var->Name()) != input_names_set.end()) {
+            continue;
+          }
+          auto* ptr = thread_scope_->FindLocalVar(var->Name());
+          PADDLE_ENFORCE_NE(ptr, nullptr,
+              phi::errors::NotFound("The var %s is not found.", var->Name()));
+          need_reuse_var_.push_back(ptr);
+        }
+      }
+    }
   }
 }

@@ -400,6 +435,18 @@ void PSGPUWorker::TrainFiles() {
             std::chrono::microseconds(200));
       }
       thread_scope = cur_task.scope;
+      // tensor share buffer
+      std::vector<Variable*>& cur_scope_vars = need_reuse_var_vec_[thread_scope];
+      PADDLE_ENFORCE_EQ(cur_scope_vars.size(), need_reuse_var_.size(),
+                        platform::errors::Fatal(
+                            "reuse vars size must be same."));
+      for (size_t i = 0; i < need_reuse_var_.size(); i++) {
+        Variable* child = cur_scope_vars[i];
+        Variable* parent = need_reuse_var_[i];
+        if (child->IsType<LoDTensor>()) {
+          child->GetMutable<LoDTensor>()->ShareBufferWith(*(parent->GetMutable<LoDTensor>()));
+        }
+      }
     }

     if (cur_batch <= 0) {
@@ -409,9 +456,11 @@ void PSGPUWorker::TrainFiles() {
     total_ins_num += cur_batch;

     if (shape_check_flag_.load()) {
-      VLOG(0) << "Begin OpRunAndShapeCheck... "
+      VLOG(0) << "Begin OpRunAndShapeCheck, "
+              << shape_check_count_.load();
+      if (scope_num_ == 1 || shape_check_count_.fetch_sub(1) <= 0) {
+        VLOG(0) << "End OpRunAndShapeCheck."
               << shape_check_count_.load();
-      if (shape_check_count_.fetch_sub(1) <= 0) {
         shape_check_flag_ = false;
       }
     }
@@ -514,6 +563,17 @@ void PSGPUWorker::TrainFiles() {
     ++batch_cnt;

     if (scope_num_ != 1) {
+      std::vector<Variable*>& cur_scope_vars = need_reuse_var_vec_[thread_scope];
+      PADDLE_ENFORCE_EQ(cur_scope_vars.size(), need_reuse_var_.size(),
+                        platform::errors::Fatal(
+                            "reuse vars size must be same."));
+      for (size_t i = 0; i < need_reuse_var_.size(); i++) {
+        Variable* child = cur_scope_vars[i];
+        Variable* parent = need_reuse_var_[i];
+        if (child->IsType<LoDTensor>()) {
+          parent->GetMutable<LoDTensor>()->ShareBufferWith(*(child->GetMutable<LoDTensor>()));
+        }
+      }
       device_reader_->get_pack(cur_task.pack);
       free_task_queue_.Push(cur_task);
     }
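
The reuse logic pairs each per-task thread scope with the main thread scope: before a batch runs, every non-persistable, non-input variable in the task scope borrows the buffer of the matching main-scope variable, and after the batch the buffer is handed back so the next task starts from the largest allocation seen so far. A minimal sketch of that ShareBufferWith round trip on two tensors (names and control flow are illustrative, not the worker's):

    #include "paddle/phi/core/dense_tensor.h"

    // "parent" stands in for the variable in the main thread scope and
    // "child" for the matching variable in a per-task scope.
    void ReuseScopeBuffer(phi::DenseTensor* parent, phi::DenseTensor* child) {
      // Before the batch: the task-scope tensor points at the main-scope
      // holder instead of allocating its own memory.
      child->ShareBufferWith(*parent);

      // ... ops run on the task scope and may replace or grow child's holder ...

      // After the batch: hand the (possibly larger) buffer back so the main
      // scope keeps it for the next task that reuses this slot.
      parent->ShareBufferWith(*child);
    }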

paddle/fluid/memory/allocation/allocator_facade.cc

Lines changed: 2 additions & 1 deletion
@@ -369,7 +369,7 @@ class AllocatorFacadePrivate {
       bool create_if_not_found = false) {
     if (LIKELY(!IsCUDAGraphCapturing())) {
       if (stream == GetDefaultStream(place)) {
-        VLOG(7) << "Get Allocator by passing in a default stream";
+        VLOG(0) << "Get Allocator by passing in a default stream";
         return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
       }
     }
@@ -391,6 +391,7 @@ class AllocatorFacadePrivate {
     /* unique_lock_guard */ {
       std::unique_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
+      VLOG(0) << "InitStreamSafeCUDAAllocator of " << reinterpret_cast<uint64_t>(stream);
       InitStreamSafeCUDAAllocator(place, stream);
       return cuda_allocators_[place][stream];
     }

paddle/phi/core/dense_tensor.inl

Lines changed: 5 additions & 0 deletions
@@ -48,6 +48,11 @@ T* mutable_data(const DDim& dims,
                 const phi::Place& place,
                 size_t requested_size = 0);

+template <typename T>
+T* mutable_data(const DDim& dims,
+                const phi::Place& place,
+                const phi::Stream& stream);
+
 void* mutable_data(const phi::Place& place,
                    paddle::experimental::DataType type,
                    size_t requested_size = 0);

paddle/phi/core/dense_tensor_impl.cc

Lines changed: 12 additions & 1 deletion
@@ -167,6 +167,15 @@ inline T* DenseTensor::mutable_data(const DDim& dims,
   return mutable_data<T>(place, requested_size);
 }

+template <typename T>
+inline T* DenseTensor::mutable_data(const DDim& dims,
+                                    const Place& place,
+                                    const phi::Stream& stream) {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+  meta_.dims = dims;
+  return reinterpret_cast<T*>(mutable_data(place, paddle::experimental::CppTypeToDataType<T>::Type(), stream));
+}
+
 template <typename T>
 inline T* DenseTensor::mutable_data(const Place& place, size_t requested_size) {
   static_assert(std::is_pod<T>::value, "T must be POD");
@@ -186,7 +195,9 @@ void DenseTensor::ShareBufferWith(const DenseTensor& tensor) {
   template dtype* DenseTensor::mutable_data(                          \
       const DDim& dims, const Place& place, size_t requested_size);   \
   template dtype* DenseTensor::mutable_data(const Place& place,       \
-      size_t requested_size);
+      size_t requested_size);                                         \
+  template dtype* DenseTensor::mutable_data(                          \
+      const DDim& dims, const Place& place, const phi::Stream& stream);

 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool)
 LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t)
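
The new overload makes typed allocation stream-aware, so callers such as MiniBatchGpuPack can size a tensor and bind its storage to a specific allocation stream in one call. A hedged usage sketch; the place, stream id, and wrapper function are illustrative and not part of this commit:

    #include "paddle/phi/core/dense_tensor.h"
    #include "paddle/phi/core/stream.h"

    // Sketch: request typed storage for a tensor on an explicit stream,
    // using the overload instantiated above.
    float* MutableDataOnStream(phi::DenseTensor* tensor,
                               const phi::Place& place,
                               phi::StreamId stream_id,
                               int64_t rows) {
      // The allocation goes through the stream-safe allocator keyed by
      // stream_id instead of the device's default stream.
      return tensor->mutable_data<float>({rows, 1}, place, phi::Stream(stream_id));
    }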
