
Commit 98e024a

zhiqiu authored and co63oc committed
refine pir dist to_static to support master weight (PaddlePaddle#65089)
* refine pir dist to_static
* fix bug
* fix partial
1 parent 4864404 commit 98e024a

10 files changed: +118, -89 lines


paddle/fluid/pybind/auto_parallel_py.cc

Lines changed: 11 additions & 9 deletions
@@ -490,9 +490,10 @@ void BindAutoParallel(py::module *m) {
       .def(py::self == py::self)   // NOLINT
       .def(py::self != py::self);  // NOLINT
 
-  auto Partial = py::class_<phi::distributed::Partial,
-                            std::shared_ptr<phi::distributed::Partial>>(
-      *m, "Partial", Placement, R"DOC(
+  auto Partial =
+      py::class_<phi::distributed::Partial,
+                 std::shared_ptr<phi::distributed::Partial>>(
+          *m, "Partial", Placement, R"DOC(
   The `Partial` describes `Tensor` across multiple devices, this type of tensor has the same shape but only a fraction of the value, which can be further reduce (e.g. sum/min/max) to obtain dist_tensor, often used as an intermediate representation.
 
   Parameters:
@@ -510,12 +511,13 @@ void BindAutoParallel(py::module *m) {
           >>> d_tensor = dist.shard_tensor(a, mesh, [dist.Partial()])
 
   )DOC")
-      .def(py::init<phi::ReduceType>(),
-           py::arg("reduce_type") = phi::ReduceType::kRedSum)
-      .def("__hash__", &phi::distributed::Partial::hash)
-      .def("__str__", &phi::distributed::Partial::to_string)
-      .def(py::self == py::self)   // NOLINT
-      .def(py::self != py::self);  // NOLINT
+          .def(py::init<phi::ReduceType>(),
+               py::arg("reduce_type") = phi::ReduceType::kRedSum)
+          .def("reduce_type", &phi::distributed::Partial::get_reduce_type)
+          .def("__hash__", &phi::distributed::Partial::hash)
+          .def("__str__", &phi::distributed::Partial::to_string)
+          .def(py::self == py::self)   // NOLINT
+          .def(py::self != py::self);  // NOLINT
 
   g_placement_shard_pytype = reinterpret_cast<PyTypeObject *>(Shard.ptr());
   g_placement_replicated_pytype =
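For reference, a minimal Python sketch of what the new `reduce_type` binding exposes; the mesh and tensor below are illustrative assumptions, and the `dist.shard_tensor` call assumes the script is launched on the two ranks of the mesh.

import paddle
import paddle.distributed as dist

mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
a = paddle.ones([4, 8])

# Each rank holds a partial value; reducing it (sum by default) yields the full tensor.
d_tensor = dist.shard_tensor(a, mesh, [dist.Partial()])

p = dist.Partial()
print(p)                # existing __str__ binding
print(p.reduce_type())  # accessor added in this hunk; defaults to kRedSum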

paddle/fluid/pybind/dist_static_op_function.h

Lines changed: 5 additions & 59 deletions
@@ -43,38 +43,11 @@ static PyObject *static_api_shard_tensor(PyObject *self,
     auto placements = CastPyArg2VectorOfPlacement(placements_obj, 2);
 
     int64_t ndim = GetValueDims(input).size();
-    std::vector<int64_t> dim_map(ndim, -1);
-    for (size_t i = 0; i < placements.size(); i++) {
-      auto &placement = placements[i];
-      if (placement->is_shard()) {
-        auto shard_dim =
-            dynamic_cast<const phi::distributed::Shard &>(*placement).get_dim();
-        PADDLE_ENFORCE_EQ(
-            dim_map[shard_dim],
-            -1,
-            common::errors::InvalidArgument(
-                "Tensor dim %lld is already sharded on mesh dim %lld,"
-                " DistTensor operator implementation does not support things "
-                "like hybrid"
-                " sharding strategies yet (i.e. [Shard(0), Shard(0)])",
-                shard_dim,
-                dim_map[shard_dim]));
-        dim_map[shard_dim] = i;
-      }
-    }
-    paddle::flat_hash_map<int64_t, phi::ReduceType> partial_status;
-    for (size_t i = 0; i < placements.size(); ++i) {
-      auto &p = placements[i];
-      if (p->is_partial()) {
-        partial_status.insert(
-            {i,
-             dynamic_cast<phi::distributed::Partial &>(*p).get_reduce_type()});
-      }
-    }
+    auto res = CvtPlacements(placements, ndim);
 
     // Call ir static api
     auto static_api_out = paddle::dialect::shard_tensor(
-        input, process_mesh, dim_map, partial_status);
+        input, process_mesh, std::get<0>(res), std::get<1>(res));
 
     return ToPyObject(static_api_out);
   } catch (...) {
@@ -101,38 +74,11 @@ static PyObject *static_api_reshard(PyObject *self,
     auto placements = CastPyArg2VectorOfPlacement(placements_obj, 2);
 
     int64_t ndim = GetValueDims(input).size();
-    std::vector<int64_t> dim_map(ndim, -1);
-    for (size_t i = 0; i < placements.size(); i++) {
-      auto &placement = placements[i];
-      if (placement->is_shard()) {
-        auto shard_dim =
-            dynamic_cast<const phi::distributed::Shard &>(*placement).get_dim();
-        PADDLE_ENFORCE_EQ(
-            dim_map[shard_dim],
-            -1,
-            common::errors::InvalidArgument(
-                "Tensor dim %lld is already sharded on mesh dim %lld,"
-                " DistTensor operator implementation does not support things "
-                "like hybrid"
-                " sharding strategies yet (i.e. [Shard(0), Shard(0)])",
-                shard_dim,
-                dim_map[shard_dim]));
-        dim_map[shard_dim] = i;
-      }
-    }
-    paddle::flat_hash_map<int64_t, phi::ReduceType> partial_status;
-    for (size_t i = 0; i < placements.size(); ++i) {
-      auto &p = placements[i];
-      if (p->is_partial()) {
-        partial_status.insert(
-            {i,
-             dynamic_cast<phi::distributed::Partial &>(*p).get_reduce_type()});
-      }
-    }
+    auto res = CvtPlacements(placements, ndim);
 
     // Call ir static api
-    auto static_api_out =
-        paddle::dialect::reshard(input, process_mesh, dim_map, partial_status);
+    auto static_api_out = paddle::dialect::reshard(
+        input, process_mesh, std::get<0>(res), std::get<1>(res));
 
     return ToPyObject(static_api_out);
   } catch (...) {
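For context, a hedged sketch of the eager-style calls that reach these two bindings when tracing a PIR static program; the 2x2 mesh and placements are illustrative assumptions (a 4-rank launch is assumed). Both bindings now delegate placement parsing to `CvtPlacements`, defined in the next file.

import paddle
import paddle.distributed as dist

mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["dp", "mp"])
x = paddle.ones([8, 8])

# dist.shard_tensor / dist.reshard lower to static_api_shard_tensor /
# static_api_reshard under to_static with PIR enabled.
d_x = dist.shard_tensor(x, mesh, [dist.Shard(0), dist.Partial()])
d_y = dist.reshard(d_x, mesh, [dist.Replicate(), dist.Shard(1)])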

paddle/fluid/pybind/eager_utils.cc

Lines changed: 33 additions & 0 deletions
@@ -2867,4 +2867,37 @@ void BindEagerUtils(PyObject* module) {
   }
 }
 
+std::tuple<std::vector<int64_t>,
+           paddle::flat_hash_map<int64_t, phi::ReduceType>>
+CvtPlacements(Placements placements, int ndim) {
+  std::vector<int64_t> dim_map(ndim, -1);
+  for (size_t i = 0; i < placements.size(); i++) {
+    auto& placement = placements[i];
+    if (placement->is_shard()) {
+      auto shard_dim =
+          dynamic_cast<const phi::distributed::Shard&>(*placement).get_dim();
+      PADDLE_ENFORCE_EQ(
+          dim_map[shard_dim],
+          -1,
+          common::errors::InvalidArgument(
+              "Tensor dim %lld is already sharded on mesh dim %lld,"
+              " DistTensor operator implementation does not support things "
+              "like hybrid"
+              " sharding strategies yet (i.e. [Shard(0), Shard(0)])",
+              shard_dim,
+              dim_map[shard_dim]));
+      dim_map[shard_dim] = i;
+    }
+  }
+  paddle::flat_hash_map<int64_t, phi::ReduceType> partial_status;
+  for (size_t i = 0; i < placements.size(); ++i) {
+    auto& p = placements[i];
+    if (p->is_partial()) {
+      partial_status.insert(
+          {i, dynamic_cast<phi::distributed::Partial&>(*p).get_reduce_type()});
+    }
+  }
+  return {dim_map, partial_status};
+}
+
 }  // namespace paddle::pybind
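For readers who prefer Python, an illustrative mirror (not code from the patch) of what `CvtPlacements` returns: a per-tensor-dim map of sharding mesh dims plus a {mesh_dim: reduce_type} map for `Partial` placements.

# Illustrative Python mirror of CvtPlacements; reduce_type() relies on the
# binding added in auto_parallel_py.cc above.
def cvt_placements(placements, ndim):
    dim_map = [-1] * ndim  # dim_map[tensor_dim] = mesh dim sharding it, or -1
    partial_status = {}    # partial_status[mesh_dim] = reduce type
    for mesh_dim, p in enumerate(placements):
        if p.is_shard():
            shard_dim = p.get_dim()
            if dim_map[shard_dim] != -1:
                # Same guard as the PADDLE_ENFORCE_EQ above: sharding one tensor
                # dim over two mesh dims (e.g. [Shard(0), Shard(0)]) is rejected.
                raise ValueError(
                    f"Tensor dim {shard_dim} is already sharded on mesh dim "
                    f"{dim_map[shard_dim]}"
                )
            dim_map[shard_dim] = mesh_dim
        elif p.is_partial():
            partial_status[mesh_dim] = p.reduce_type()
    return dim_map, partial_status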

paddle/fluid/pybind/eager_utils.h

Lines changed: 4 additions & 0 deletions
@@ -507,5 +507,9 @@ void ConvertAllInputsToDistTensor(const phi::distributed::ProcessMesh* mesh,
 void ConvertToDistTensor(Tensor* x, const phi::distributed::ProcessMesh* mesh);
 void BindEagerUtils(PyObject* module);
 
+std::tuple<std::vector<int64_t>,
+           paddle::flat_hash_map<int64_t, phi::ReduceType>>
+CvtPlacements(phi::distributed::Placements placements, int ndim);
+
 }  // namespace pybind
 }  // namespace paddle

paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc

Lines changed: 4 additions & 0 deletions
@@ -47,5 +47,9 @@ const distributed::TensorDistAttr& DistMetaTensor::dist_attr() const {
   }
 }
 
+bool DistMetaTensor::initialized() const {
+  return tensor_ != nullptr || dist_attr_ != TensorDistAttr();
+}
+
 }  // namespace distributed
 }  // namespace phi

paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h

Lines changed: 2 additions & 0 deletions
@@ -48,6 +48,8 @@ class DistMetaTensor : public MetaTensor {
 
   const distributed::TensorDistAttr& dist_attr() const;
 
+  bool initialized() const override;
+
  private:
   /**
    * Note: When using the semi-automatic parallel segmentation derivation rules

python/paddle/distributed/auto_parallel/placement_type.py

Lines changed: 5 additions & 2 deletions
@@ -76,6 +76,7 @@ def to_dim_map(placements, tensor_dims):
         List[int]: a list of integer that represents sharding on each tensor dimension.
     """
     dim_map = [-1] * tensor_dims
+    partial_status = {}
     for i, placement in enumerate(placements):
         if placement.is_shard():
             shard_dim = cast(Shard, placement).get_dim()
@@ -85,13 +86,15 @@ def to_dim_map(placements, tensor_dims):
             )
 
             dim_map[shard_dim] = i
+        if placement.is_partial():
+            partial_status[i] = cast(Partial, placement).reduce_type()
 
-    return dim_map
+    return dim_map, partial_status
 
 
 def get_shard_spec(mesh, placements, tensor_dims):
     """to get shard_spec for construct DistAttr for static API."""
-    dim_map = to_dim_map(placements, tensor_dims)
+    dim_map, _ = to_dim_map(placements, tensor_dims)
     mesh_dim_names = mesh.dim_names
     shard_spec = [None] * len(dim_map)
     for i, d in enumerate(dim_map):
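A small usage sketch of the updated `to_dim_map`; the placements are illustrative, and the two-value return is what `python/paddle/pir/core.py` below relies on.

from paddle.distributed import Partial, Shard
from paddle.distributed.auto_parallel.placement_type import to_dim_map

# 2-D tensor on a 2-D mesh: shard tensor dim 1 over mesh dim 0 and keep a
# partial (default sum) state on mesh dim 1.
dim_map, partial_status = to_dim_map([Shard(1), Partial()], 2)
# dim_map == [-1, 0]            -> tensor dim 1 is sharded along mesh dim 0
# list(partial_status) == [1]   -> mesh dim 1 holds a partial (sum) value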

python/paddle/jit/pir_dy2static/parameter_recorder.py

Lines changed: 2 additions & 15 deletions
@@ -48,23 +48,10 @@ def get(self, program, tensor):
                 name=tensor.name,
                 initializer=non_used_initializer,
                 trainable=(not tensor.stop_gradient),
+                placements=tensor.placements,
+                process_mesh=tensor.process_mesh,
             )
 
-            if tensor.placements is not None:  # import for shard tensor api
-                import paddle.distributed as dist
-
-                dist_value = dist.shard_tensor(
-                    value,
-                    tensor.process_mesh,
-                    tensor.placements,
-                    stop_gradient=value.stop_gradient,
-                )
-                value.set_type(dist_value.type())
-                value.get_defining_op().dist_attr = (
-                    dist_value.get_defining_op().dist_attr
-                )
-                dist_value.block.remove_op(dist_value.get_defining_op())
-
             if isinstance(tensor, paddle.Tensor):
                 params.add(tensor)
             mappings[id(tensor)] = value
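The two kwargs forwarded here come straight from the eager dist parameter being recorded; a brief illustration of where those attributes live (mesh and shapes are assumptions, 2-rank launch assumed):

import paddle
import paddle.distributed as dist

mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
w = paddle.create_parameter(shape=[8, 8], dtype='float32')
d_w = dist.shard_tensor(w, mesh, [dist.Shard(0)])

# These are the fields the recorder now passes to create_parameter directly,
# instead of inserting and then removing a shard_tensor op in the program.
print(d_w.process_mesh)
print(d_w.placements)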

python/paddle/optimizer/optimizer.py

Lines changed: 17 additions & 4 deletions
@@ -835,14 +835,27 @@ def get_param_from_startup(startup, name):
                 startup_param = get_param_from_startup(
                     startup_program, param.name
                 )
-                var = paddle.cast(startup_param, 'float32')
-                var.persistable = True
-                paddle._pir_ops.set_persistable_value(var, var_name)
+                startup_var = paddle.cast(startup_param, 'float32')
+                startup_var.persistable = True
+                paddle._pir_ops.set_persistable_value(startup_var, var_name)
                 with paddle.static.program_guard(main_program):
                     paddle.pir.reset_insertion_point_to_start()
                     var = paddle.static.data(
-                        var_name, var.shape, var.dtype, core.Place()
+                        var_name,
+                        startup_var.shape,
+                        startup_var.dtype,
+                        core.Place(),
                     )
+                    if startup_var.is_dist():
+                        var.set_type(startup_var.type())
+                        op_dist_attr = (
+                            paddle.base.libpaddle.pir.create_op_dist_attribute(
+                                startup_var.dist_attr().process_mesh,
+                                [],
+                                [startup_var.dist_attr()],
+                            )
+                        )
+                        var.get_defining_op().dist_attr = op_dist_attr
                     var.persistable = True
             elif framework.in_dygraph_mode():
                 var = paddle.cast(param, 'float32')
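This is the hunk behind the commit title: when a fp16/bf16 dist parameter gets a float32 master weight under PIR to_static, the master weight's data var in the main program now inherits the parameter's dist type and op dist_attr. A hedged end-to-end sketch of the scenario; the model, mesh, AMP level, and data are illustrative assumptions (2-rank GPU launch assumed).

import paddle
import paddle.distributed as dist

mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

class MLP(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.fc = paddle.nn.Linear(8, 8)

    def forward(self, x):
        return self.fc(x)

def shard_fn(name, layer, process_mesh):
    if isinstance(layer, paddle.nn.Linear):
        layer.weight = dist.shard_tensor(
            layer.weight, process_mesh, [dist.Shard(1)]
        )

model = dist.shard_layer(MLP(), mesh, shard_fn)
opt = paddle.optimizer.AdamW(
    parameters=model.parameters(), multi_precision=True
)
# AMP O2 keeps fp16 parameters, so the optimizer creates fp32 master weights.
model, opt = paddle.amp.decorate(models=model, optimizers=opt, level='O2')

loader = paddle.io.DataLoader(
    paddle.io.TensorDataset([paddle.randn([16, 8]), paddle.randn([16, 8])]),
    batch_size=4,
)

# Under dist.to_static, the master weights created by the hunk above are
# expected to carry the sharded parameters' dist attributes.
dist_model = dist.to_static(model, loader, paddle.nn.MSELoss(), opt)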

python/paddle/pir/core.py

Lines changed: 35 additions & 0 deletions
@@ -317,12 +317,44 @@ def create_parameter(
     main_program = default_main_program()
     parameter_meta = ParameterMeta(shape, dtype)
 
+    is_dist = False
+    if (
+        'placements' in kwargs
+        and kwargs['placements']
+        and 'process_mesh' in kwargs
+        and kwargs['process_mesh']
+    ):
+        is_dist = True
+
+    def to_dist(value):
+        import paddle
+        import paddle.distributed as dist
+
+        process_mesh = kwargs['process_mesh']
+        dim_map, partial_status = dist.auto_parallel.placement_type.to_dim_map(
+            kwargs['placements'], len(shape)
+        )
+        dist_attr = paddle.base.libpaddle.pir.create_tensor_dist_attribute(
+            process_mesh, dim_map, partial_status
+        )
+        dist_type = paddle.base.libpaddle.pir.cvt_to_dist_type(
+            value.type(), dist_attr
+        )
+        value.set_type(dist_type)
+        op_dist_attr = paddle.base.libpaddle.pir.create_op_dist_attribute(
+            process_mesh, [], [dist_attr]
+        )
+        value.get_defining_op().dist_attr = op_dist_attr
+
     with program_guard(startup_program):
         initializer = kwargs['initializer']
         init_result = initializer(
             parameter_meta, startup_program.global_block()
         )
         init_result.persistable = True
+        if is_dist:
+            to_dist(init_result)
+
         set_parameter(init_result, value_name)
 
     main_program.set_parameters_from(startup_program)
@@ -331,6 +363,9 @@ def create_parameter(
         param = parameter(value_name)
         param.persistable = True
 
+        if is_dist:
+            to_dist(param)
+
     param.trainable = kwargs.get('trainable', True)
     param.stop_gradient = not param.trainable
     param.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})
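In practice `create_parameter` receives `placements`/`process_mesh` from the dy2static parameter recorder above rather than from user code. Purely for illustration (the names, mesh, and initializer are assumptions, and PIR static mode must be active), a direct call would look like this:

import paddle
import paddle.distributed as dist
from paddle.pir.core import create_parameter

paddle.enable_static()  # assumes PIR is the active static IR (e.g. FLAGS_enable_pir_api=1)
mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

w = create_parameter(
    dtype='float32',
    shape=[8, 8],
    name='w_example',
    initializer=paddle.nn.initializer.Constant(0.0),
    trainable=True,
    placements=[dist.Shard(0)],
    process_mesh=mesh,
)
# With both kwargs set, is_dist is True and to_dist() stamps the dist type and
# op dist_attr onto the startup init result and the main-program parameter.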
