
Commit ae0fe07

zhiqiu, 0x45f, and winter-wang authored
Fit PIR AMP for auto_parallel (#65892)
* [Test]Support pir amp for dist
* Refine code
* refine pir dist to_static
* fix bug
* fix partial
* Fix dist engine code
* fit pir grad_scaler with auto_parallel
* use amp strategy
* update ut
* update ut
* fit for amp o1
* revert changes of grad_scaler
* fix ut and refine code

---------

Co-authored-by: 0x45f <wangzhen45@baidu.com>
Co-authored-by: winter-wang <1030748926@qq.com>
1 parent a8cdd97 commit ae0fe07

File tree

22 files changed: +455 -82 lines changed


paddle/fluid/pir/dialect/distributed/ir/dist_op.cc

Lines changed: 1 addition & 24 deletions
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_op.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h"
+#include "paddle/fluid/pir/dialect/distributed/ir/dist_tools.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_type.h"
 #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h"
 #include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h"
@@ -273,30 +274,6 @@ void ReshardOp::VerifySig() {
   VLOG(4) << "End Verifying for: ShardTensorOp.";
 }
 
-ProcessMeshAttribute MergeMeshes(const ProcessMeshAttribute& mesh1,
-                                 const ProcessMeshAttribute& mesh2) {
-  if (mesh1 == mesh2) return mesh1;
-  // Combine the two ids
-  std::vector<int64_t> merged_ids;
-  std::vector<int64_t> ids1 = mesh1.process_ids();
-  std::vector<int64_t> ids2 = mesh2.process_ids();
-
-  merged_ids.reserve(ids1.size() + ids2.size());
-  merged_ids.insert(merged_ids.end(), ids1.begin(), ids1.end());
-  merged_ids.insert(merged_ids.end(), ids2.begin(), ids2.end());
-
-  // Remove duplicates
-  std::sort(merged_ids.begin(), merged_ids.end());
-  auto last = std::unique(merged_ids.begin(), merged_ids.end());
-  merged_ids.erase(last, merged_ids.end());
-
-  return ProcessMeshAttribute::get(
-      pir::IrContext::Instance(),
-      {static_cast<int64_t>(merged_ids.size())},  // flatten mesh shape
-      merged_ids,
-      {"merged"});
-}
-
 void ReshardOp::Build(pir::Builder& builder,
                       pir::OperationArgument& argument,
                       pir::Value input,

paddle/fluid/pir/dialect/distributed/ir/dist_tools.cc

Lines changed: 158 additions & 0 deletions
@@ -13,12 +13,152 @@
 // limitations under the License.
 
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_tools.h"
+
+#include <unordered_set>
+
 #include "glog/logging.h"
 #include "paddle/common/enforce.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h"
 #include "paddle/pir/include/core/operation.h"
 
 namespace paddle::dialect {
 
+ProcessMeshAttribute MergeMeshes(const ProcessMeshAttribute& mesh1,
+                                 const ProcessMeshAttribute& mesh2) {
+  if (mesh1 == mesh2) return mesh1;
+  // Combine the two ids
+  std::vector<int64_t> merged_ids;
+  std::vector<int64_t> ids1 = mesh1.process_ids();
+  std::vector<int64_t> ids2 = mesh2.process_ids();
+
+  merged_ids.reserve(ids1.size() + ids2.size());
+  merged_ids.insert(merged_ids.end(), ids1.begin(), ids1.end());
+  merged_ids.insert(merged_ids.end(), ids2.begin(), ids2.end());
+
+  // Remove duplicates
+  std::sort(merged_ids.begin(), merged_ids.end());
+  auto last = std::unique(merged_ids.begin(), merged_ids.end());
+  merged_ids.erase(last, merged_ids.end());
+
+  return ProcessMeshAttribute::get(
+      pir::IrContext::Instance(),
+      {static_cast<int64_t>(merged_ids.size())},  // flatten mesh shape
+      merged_ids,
+      {"merged"});
+}
+
+ProcessMeshAttribute MergeInputMeshes(const std::vector<pir::Value>& inputs) {
+  auto ctx = pir::IrContext::Instance();
+  auto mesh = ProcessMeshAttribute::get(ctx, {}, {}, {});
+  for (auto value : inputs) {
+    if (auto dist_type = value.type().dyn_cast<DistTypeInterface>()) {
+      mesh = MergeMeshes(mesh, dist_type.process_mesh_attr());
+    } else {
+      auto vec_type = value.type().dyn_cast<pir::VectorType>();
+      if (!vec_type) {
+        continue;
+      }
+      for (size_t idx = 0; idx < vec_type.size(); ++idx) {
+        if (auto dist_type = vec_type[idx].dyn_cast<DistTypeInterface>()) {
+          mesh = MergeMeshes(mesh, dist_type.process_mesh_attr());
+        }
+      }
+    }
+  }
+  return mesh;
+}
+
+ProcessMeshAttribute CreateGlobalMesh(const std::vector<pir::Value>& inputs) {
+  auto ctx = pir::IrContext::Instance();
+  struct MyHash {
+    std::size_t operator()(const ProcessMeshAttribute& obj) const {
+      return obj.hash();
+    }
+  };
+  std::unordered_set<ProcessMeshAttribute, MyHash> meshes;
+  for (auto value : inputs) {
+    if (auto dist_type = value.type().dyn_cast<DistTypeInterface>()) {
+      meshes.insert(dist_type.process_mesh_attr());
+    } else {
+      if (auto vec_type = value.type().dyn_cast<pir::VectorType>()) {
+        for (size_t idx = 0; idx < vec_type.size(); ++idx) {
+          if (auto dist_type = vec_type[idx].dyn_cast<DistTypeInterface>()) {
+            meshes.insert(dist_type.process_mesh_attr());
+          }
+        }
+      }
+    }
+  }
+
+  ProcessMeshAttribute global_mesh;
+  PADDLE_ENFORCE_GT(meshes.size(),
+                    0,
+                    common::errors::InvalidArgument("There is no dist input"));
+  // get mesh that has the most dimensions
+  auto max_ndim_mesh = ProcessMeshAttribute::get(ctx, {}, {}, {});
+  int64_t min_ndim = std::numeric_limits<int64_t>::max();
+  for (const auto& mesh : meshes) {
+    if (mesh.ndim() > max_ndim_mesh.ndim()) {
+      max_ndim_mesh = mesh;
+    }
+    if (mesh.ndim() < min_ndim) {
+      min_ndim = mesh.ndim();
+    }
+  }
+  // min != max, means there are different mesh size
+  // so, the max_ndim_mesh should be the global mesh
+  if (min_ndim != max_ndim_mesh.ndim()) {
+    for (const auto& mesh : meshes) {
+      if (mesh != max_ndim_mesh) {
+        if (!phi::distributed::IsSubMesh(max_ndim_mesh.process_mesh(),
+                                         mesh.process_mesh())) {
+          PADDLE_THROW(common::errors::InvalidArgument(
+              "The small mesh should be the sub mesh of the large mesh, but "
+              "got {%s} vs {%s} ",
+              mesh,
+              max_ndim_mesh));
+        }
+      }
+    }
+    global_mesh = max_ndim_mesh;
+  } else {
+    auto it = meshes.begin();
+    auto first_mesh = *it;
+    if (meshes.size() > 1) {
+      auto global_ids = first_mesh.process_ids();
+      auto global_shape = first_mesh.shape();
+      auto global_names = first_mesh.dim_names();
+      ++it;
+      for (; it != meshes.end(); ++it) {
+        auto mesh = *it;
+        VLOG(4) << (mesh.shape() == first_mesh.shape()) << " "
+                << (mesh.dim_names() == first_mesh.dim_names()) << " "
+                << (mesh.process_ids() != first_mesh.process_ids());
+        if (mesh.shape() == first_mesh.shape() &&
+            mesh.dim_names() == first_mesh.dim_names() &&
+            mesh.process_ids() != first_mesh.process_ids()) {
+          global_ids.insert(global_ids.end(),
+                            mesh.process_ids().begin(),
+                            mesh.process_ids().end());
+        } else {
+          PADDLE_THROW(common::errors::InvalidArgument(
+              "The sub meshes should have same shape and names but different "
+              "process_ids, but got {%s} vs {%s} ",
+              first_mesh,
+              mesh));
+        }
+      }
+      global_shape.emplace(global_shape.begin(), meshes.size());
+      global_names.emplace(global_names.begin(), "global");
+      global_mesh = ProcessMeshAttribute::get(
+          ctx, global_shape, global_ids, global_names);
+    } else {
+      global_mesh = first_mesh;
+    }
+  }
+  return global_mesh;
+}
+
 bool AllInputAreDist(const std::vector<pir::Value>& inputs) {
   for (auto value : inputs) {
     auto type = value.type();
@@ -210,6 +350,22 @@ void CopyLeafOpToMesh(pir::Value value, ProcessMeshAttribute mesh_attr) {
     if (op->num_operands() != 0u || op->num_results() != 1u) {
       return;
     }
+    if (mesh_attr.ndim() > 1 &&
+        phi::distributed::IsSubMesh(
+            mesh_attr.process_mesh(),
+            dist_type.process_mesh_attr().process_mesh())) {
+      auto new_dist_type = dist_type.CopyWithNewMesh(mesh_attr);
+      value.set_type(new_dist_type);
+      op->set_attribute(
+          kAttrOpDistAttr,
+          OperationDistAttribute::get(new_dist_type.ir_context(),
+                                      mesh_attr,
+                                      {},
+                                      {new_dist_type.tensor_dist_attr()}));
+      VLOG(4) << "CopyLeafOpToMesh: change mesh from "
+              << dist_type.process_mesh_attr() << " to " << mesh_attr;
+      return;
+    }
     pir::IrMapping ir_mapping;
     auto new_op = op->Clone(ir_mapping);
     op->GetParent()->insert(*op, new_op);
@@ -222,6 +378,8 @@ void CopyLeafOpToMesh(pir::Value value, ProcessMeshAttribute mesh_attr) {
                                     mesh_attr,
                                     {},
                                     {dist_type.tensor_dist_attr()}));
+    VLOG(4) << "CopyLeafOpToMesh: copy value from "
+            << dist_type.process_mesh_attr() << " to " << mesh_attr;
   }
 }
 }
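
Note (reviewer-style aside, not part of the commit): MergeMeshes flattens the union of two meshes' process ids into a 1-D mesh named "merged", while CreateGlobalMesh either picks the highest-dimensional mesh (when every other mesh is one of its sub-meshes) or stacks equally shaped sub-meshes along a new leading "global" dimension. A minimal sketch of the flattening rule, using only the ProcessMeshAttribute::get overload seen above; the include set is abridged and the helper name is hypothetical:

#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h"
#include "paddle/fluid/pir/dialect/distributed/ir/dist_tools.h"

// Hypothetical demo, not part of the commit.
paddle::dialect::ProcessMeshAttribute DemoMergeTwoSubMeshes() {
  auto* ctx = pir::IrContext::Instance();
  // Two disjoint 1-D sub-meshes, e.g. ranks {0, 1} and {2, 3}.
  auto mesh_a =
      paddle::dialect::ProcessMeshAttribute::get(ctx, {2}, {0, 1}, {"x"});
  auto mesh_b =
      paddle::dialect::ProcessMeshAttribute::get(ctx, {2}, {2, 3}, {"x"});
  // MergeMeshes concatenates and deduplicates the process ids, so the result
  // has shape {4}, ids {0, 1, 2, 3} and the single dim name "merged".
  // CreateGlobalMesh, given values typed on these two same-shaped sub-meshes,
  // would instead stack them into shape {2, 2} with a leading "global" dim.
  return paddle::dialect::MergeMeshes(mesh_a, mesh_b);
}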

paddle/fluid/pir/dialect/distributed/ir/dist_tools.h

Lines changed: 7 additions & 0 deletions
@@ -21,6 +21,13 @@
 namespace paddle {
 namespace dialect {
 
+ProcessMeshAttribute MergeMeshes(const ProcessMeshAttribute& mesh1,
+                                 const ProcessMeshAttribute& mesh2);
+
+ProcessMeshAttribute MergeInputMeshes(const std::vector<pir::Value>& inputs);
+
+ProcessMeshAttribute CreateGlobalMesh(const std::vector<pir::Value>& inputs);
+
 bool HasDistInput(const std::vector<pir::Value>& inputs,
                   ProcessMeshAttribute* p_mesh_attr = nullptr);
 bool AllInputAreDist(const std::vector<pir::Value>& inputs);

paddle/fluid/pir/dialect/op_generator/op_infermeta_func_gen.py

Lines changed: 11 additions & 1 deletion
@@ -789,6 +789,7 @@ def GenDistBranch(args, op_info):
     // Auto Parallel condition
     ProcessMeshAttribute op_mesh;
     if(HasDistInput(input_values, &op_mesh)) {{
+      {}
       {}
       CvtAllInputsToDist(input_values, op_mesh);
       auto ctx = pir::IrContext::Instance();
@@ -799,7 +800,15 @@ def GenDistBranch(args, op_info):
             if name == "learning_rate":
                 extra_call = "CopyLeafOpToMesh(learning_rate_, op_mesh);"
                 break
-    dist_branch_str = TEMPLATE.format(extra_call)
+    merge_input_meshes = ""
+    if (
+        op_info.class_name == 'CheckFiniteAndUnscale_Op'
+        or op_info.class_name == 'UpdateLossScaling_Op'
+    ):
+        merge_input_meshes = "op_mesh = CreateGlobalMesh(input_values);"
+        if op_info.class_name == 'CheckFiniteAndUnscale_Op':
+            extra_call = "CopyLeafOpToMesh(scale_, op_mesh);"
+    dist_branch_str = TEMPLATE.format(merge_input_meshes, extra_call)
     infer_spmd_args_list = []
     # Prepare inputs_meta_tensor & attributes for infer spmd
     for name in op_info.spmd_params:
@@ -844,6 +853,7 @@ def GenDistBranch(args, op_info):
         spmd_rule_func = "VariadicReplicatedInferSpmdDynamic"
     TEMPLATE = """
   auto spmd_info = phi::distributed::{spmd_func}({args});
+  DebugInfoForInferSpmd("{op_name}", spmd_info);
   PADDLE_ENFORCE_EQ(spmd_info.first.size(), {input_size}u, common::errors::Unavailable(
       "Size of spmd_info.first for op[{op_name}]is unexpected."));
   for(auto& arg_dist : spmd_info.first) {{
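
For context (my reading of the template change, not text from the commit): the new first {} slot is filled with merge_input_meshes, so for CheckFiniteAndUnscale_Op and UpdateLossScaling_Op the generated infermeta replaces the mesh found by HasDistInput with a global mesh built from all inputs before converting them. A hedged sketch of roughly what the generated C++ branch expands to for CheckFiniteAndUnscale_Op; local names come from the template above and the real generated function contains more code:

// Rough expansion sketch only; the actual code is emitted by
// op_infermeta_func_gen.py and continues with spmd inference and
// dist output typing.
ProcessMeshAttribute op_mesh;
if (HasDistInput(input_values, &op_mesh)) {
  op_mesh = CreateGlobalMesh(input_values);  // new merge_input_meshes slot
  CopyLeafOpToMesh(scale_, op_mesh);         // existing extra_call slot
  CvtAllInputsToDist(input_values, op_mesh);
  auto ctx = pir::IrContext::Instance();
  // ... InferSpmd + dist output types follow ...
}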

paddle/fluid/pir/dialect/op_generator/ops_api_gen.py

Lines changed: 2 additions & 2 deletions
@@ -93,6 +93,8 @@
     'c_allreduce_avg_',
     'c_reduce_avg',
     'c_reduce_avg_',
+    'c_allreduce_avg',
+    'c_allreduce_max',
     'c_reducescatter',
     'c_allreduce_min_',
     'c_allreduce_prod_',
@@ -161,8 +163,6 @@
     'assign_pos',
     'batch_fc',
     'barrier',
-    'c_allreduce_avg',
-    'c_allreduce_max',
     'c_allreduce_min',
     'c_allreduce_prod',
     'c_embedding',

paddle/fluid/pir/dialect/operator/utils/utils.cc

Lines changed: 1 addition & 0 deletions
@@ -52,6 +52,7 @@ const std::unordered_set<std::string> LegacyOpList = {
     CReduceSumOp::name(),
     CReduceSum_Op::name(),
     CAllreduceMax_Op::name(),
+    CAllreduceMaxOp::name(),
     CAllreduceMin_Op::name(),
     CAllgatherOp::name(),
     CSoftmaxWithCrossEntropyOp::name(),

paddle/phi/api/lib/api_gen_utils.cc

Lines changed: 10 additions & 3 deletions
@@ -677,12 +677,19 @@ std::vector<phi::distributed::DistTensor*> SetKernelDistOutput(
   // TODO(GhostScreaming): Inplace outputs are initialized, just set their
   // dist_attr.
   if (out->size() == out_size) {
-    VLOG(3) << "Outputs are inplace vector Tensors, just set their dist_attrs "
-            << "according to InferSPMD output result.";
+    VLOG(3) << "Outputs are inplace vector Tensors, SKIP set dist_attr for out "
+            << "to avoid changing the inplaced input";
     for (size_t i = 0; i < out_size; ++i) {
       results[i] =
           static_cast<phi::distributed::DistTensor*>(out->at(i).impl().get());
-      results[i]->unsafe_set_dist_attr(dist_attrs[i]);
+      continue;
+      // auto t =
+      //     static_cast<phi::distributed::DistTensor*>(out->at(i).impl().get());
+      // auto dist_t = std::make_shared<phi::distributed::DistTensor>(
+      //     t->shared_value(), t->dims(), dist_attrs[i]);
+      // out->at(i) = Tensor();
+      // out->at(i).set_impl(dist_t);
+      // results[i] = dist_t.get();
     }
   } else {
     out->reserve(out_size);

paddle/phi/api/lib/data_transform.cc

Lines changed: 6 additions & 2 deletions
@@ -747,6 +747,9 @@ ReshardApiInputToKernelInput(phi::DeviceContext* dev_ctx,
     if (tensor_in) {
       phi::distributed::DistTensor* dist_tensor =
           static_cast<phi::distributed::DistTensor*>(tensor_in.get());
+      VLOG(4) << "ReshardIsNeededWithPartial"
+              << ReshardIsNeededWithPartial(dist_tensor->dist_attr(),
+                                            dist_attr);
       if (ReshardIsNeededWithPartial(dist_tensor->dist_attr(), dist_attr)) {
         auto argument_name =
             (arg_name.empty() ? "tensor" : arg_name) + "_" + std::to_string(i);
@@ -806,7 +809,7 @@ void SetInplaceOutputCorrectDistAttr(
     phi::distributed::DistTensor* dist_tensor =
         static_cast<phi::distributed::DistTensor*>(tensor_in.get());
     if (dist_tensor->initialized()) {
-      if (ReshardIsNeeded(dist_tensor->dist_attr(), dist_attr)) {
+      if (ReshardIsNeededWithPartial(dist_tensor->dist_attr(), dist_attr)) {
        if (use_general_spmd_rule) {
          VLOG(6) << "SetInplaceOutputCorrectDistAttr Reshard inplace output"
                  << " to origin dist_attr "
@@ -856,7 +859,8 @@ void SetInplaceOutputCorrectDistAttr(
     phi::distributed::DistTensor* dist_tensor =
         static_cast<phi::distributed::DistTensor*>(tensor_in.get());
     if (dist_tensor->initialized()) {
-      if (ReshardIsNeeded(dist_tensor->dist_attr(), dist_attr[i])) {
+      if (ReshardIsNeededWithPartial(dist_tensor->dist_attr(),
+                                     dist_attr[i])) {
        if (use_general_spmd_rule) {
          VLOG(6) << "SetInplaceOutputCorrectDistAttr Reshard inplace output"
                  << " to origin dist_attr "

paddle/phi/core/distributed/auto_parallel/dist_tensor.h

Lines changed: 4 additions & 0 deletions
@@ -130,6 +130,10 @@ class DistTensor final
   /// \return The DenseTensor value's const reference
   const DenseTensor& value() const { return *value_; }
 
+  /// \brief Returns the shared_ptr of dense tensor value's in dist tensor.
+  /// \return The shared_ptr of dense tensor value
+  std::shared_ptr<DenseTensor> shared_value() { return value_; }
+
   /// \brief Returns the mutable dense tensor value in dist tensor.
   /// \note If DenseTensor value is modified externally, the corresponding
   /// relationship between it and the current tensor's global dims and
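
Side note (mine, not from the commit): shared_value() exposes the underlying DenseTensor as a shared_ptr, which is what the commented-out alternative in api_gen_utils.cc above relies on to rebuild a DistTensor that aliases the same storage while carrying a different dist_attr. A minimal sketch, assuming the DistTensor(shared_ptr<DenseTensor>, dims, dist_attr) constructor implied by that commented-out code; the helper name and include set are assumptions:

#include <memory>

#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

// Hypothetical helper, not part of the commit: re-wrap an existing
// DistTensor's storage under a new TensorDistAttr without copying data.
std::shared_ptr<phi::distributed::DistTensor> RewrapWithDistAttr(
    phi::distributed::DistTensor* t,
    const phi::distributed::TensorDistAttr& new_attr) {
  // shared_value() shares the DenseTensor; only the dist metadata differs.
  return std::make_shared<phi::distributed::DistTensor>(
      t->shared_value(), t->dims(), new_attr);
}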

paddle/phi/infermeta/spmd_rules/amp_ops.cc

Lines changed: 10 additions & 1 deletion
@@ -17,6 +17,7 @@
 #include <vector>
 #include "glog/logging.h"
 
+#include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h"
 #include "paddle/phi/infermeta/spmd_rules/utils.h"
 
 namespace phi {
@@ -26,14 +27,21 @@ SpmdInfo CheckFiniteAndUnscaleSpmd(const std::vector<DistMetaTensor>& xs,
                                    const DistMetaTensor& scale) {
   std::vector<TensorDistAttr> xs_attrs;
   paddle::flat_hash_map<int64_t, ReduceType> partial_on_dims;
+  auto scale_mesh = scale.dist_attr().process_mesh();
+  auto offset = 0;
   for (auto& x : xs) {
     auto dist_attr = x.dist_attr();
     dist_attr.clean_partial_status();
     xs_attrs.emplace_back(dist_attr);
     auto dims_mapping = dist_attr.dims_mapping();
+    auto mesh = dist_attr.process_mesh();
+    if (scale_mesh.ndim() > 1 && IsSubMesh(scale_mesh, mesh)) {
+      partial_on_dims[0] = ReduceType::kRedMax;
+      offset = 1;
+    }
     for (auto& m : dims_mapping) {
       if (m != -1 && partial_on_dims.count(m) == 0) {
-        partial_on_dims[m] = ReduceType::kRedMax;
+        partial_on_dims[m + offset] = ReduceType::kRedMax;
       }
     }
   }
@@ -62,6 +70,7 @@ SpmdInfo UpdateLossScalingSpmd(const std::vector<DistMetaTensor>& xs,
   }
   TensorDistAttr found_infinite_attr =
       CopyTensorDistAttrForOutput(found_infinite.dist_attr());
+  found_infinite_attr.set_dims_mapping({-1});
   return {{xs_attrs,
            found_infinite_attr,
            prev_loss_scaling.dist_attr(),
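
A short walk-through (my interpretation, not text from the commit): when scale has been moved onto a global mesh built by CreateGlobalMesh, the leading stacked-sub-mesh axis is marked partial with kRedMax, and every partial dim derived from an input's dims_mapping is shifted by one so it refers to the global mesh rather than the input's own sub-mesh. A standalone sketch of the index arithmetic only, in plain C++ with made-up values and no Paddle dependencies:

#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // Dim on the mesh that `scale` lives on -> reduce type ('M' = kRedMax).
  std::map<int64_t, char> partial_on_dims;

  // Assume scale sits on a 2-D global mesh and the input's mesh is one of
  // its sub-meshes, i.e. scale_mesh.ndim() > 1 && IsSubMesh(scale_mesh, mesh).
  const bool scale_on_global_mesh = true;
  int64_t offset = 0;
  if (scale_on_global_mesh) {
    partial_on_dims[0] = 'M';  // reduce across the stacked sub-mesh axis
    offset = 1;
  }

  // An input sharded on dim 1 of its own sub-mesh: dims_mapping = {-1, 1}.
  const std::vector<int64_t> dims_mapping = {-1, 1};
  for (int64_t m : dims_mapping) {
    if (m != -1 && partial_on_dims.count(m) == 0) {
      partial_on_dims[m + offset] = 'M';  // registered as global mesh dim 2
    }
  }

  for (const auto& kv : partial_on_dims) {
    std::cout << "partial(kRedMax) on mesh dim " << kv.first << "\n";  // 0, 2
  }
  return 0;
}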
