Add runtime error info in eager boxing #7926

Merged: 21 commits into master from add_runtime_error_info_in_eager_boxing on May 31, 2022
Commits (21)
fbfecb2  add_runtime_error_info_in_eager_boxing (clackhan, Mar 30, 2022)
ba26cc0  refine (clackhan, Mar 30, 2022)
c2355a2  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, Mar 30, 2022)
1bc85c4  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, Apr 20, 2022)
a9c537e  Merge branch 'master' of https://github.com/Oneflow-Inc/oneflow into … (clackhan, Apr 21, 2022)
b7f192d  Merge branch 'master' of https://github.com/Oneflow-Inc/oneflow into … (clackhan, Apr 21, 2022)
f5f04a9  refine (clackhan, Apr 21, 2022)
203eb33  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 24, 2022)
4382f82  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 25, 2022)
666e74f  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 26, 2022)
2323e3b  Merge branch 'master' of https://github.com/Oneflow-Inc/oneflow into … (clackhan, May 27, 2022)
46b5259  refine (clackhan, May 27, 2022)
e7d6851  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 27, 2022)
4769792  add nolint (clackhan, May 27, 2022)
f2d391a  Merge branch 'add_runtime_error_info_in_eager_boxing' of https://gith… (clackhan, May 27, 2022)
982cc3b  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 27, 2022)
e7b0326  refine (clackhan, May 27, 2022)
12e18e2  use NOLINTBEGIN (clackhan, May 27, 2022)
f1768a6  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 30, 2022)
1c53a6a  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 30, 2022)
61bbe0a  Merge branch 'master' into add_runtime_error_info_in_eager_boxing (clackhan, May 31, 2022)
21 changes: 18 additions & 3 deletions oneflow/core/boxing/asymmetric_broadcast.cpp
@@ -37,12 +37,14 @@ bool IsAllBroadcastNdSbp(Symbol<NdSbp> nd_sbp) {

 Maybe<void> RawCheckAsymmetricBroadcast(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
                                         const Shape& logical_shape) {
+  // NOLINTBEGIN(maybe-need-error-msg)
   CHECK_EQ_OR_RETURN(in->nd_sbp()->sbp_parallel_size(), 1);
   CHECK_EQ_OR_RETURN(out->nd_sbp()->sbp_parallel_size(), 1);
   CHECK_OR_RETURN(IsAllBroadcastNdSbp(in->nd_sbp()));
   CHECK_OR_RETURN(IsAllBroadcastNdSbp(out->nd_sbp()));
   CHECK_OR_RETURN(out->placement()->Bigger(*in->placement())
                   || in->placement()->Bigger(*out->placement()));
+  // NOLINTEND(maybe-need-error-msg)
   return Maybe<void>::Ok();
 }

@@ -65,7 +67,15 @@ Maybe<int64_t> CalBroadcastRoot(Symbol<ParallelDesc> src_parallel_desc,
     }
     if (machine_and_device_id_inited) { break; }
   }
-  CHECK_OR_RETURN(machine_id != -1 && device_id != -1);
+  // Always true; if this check fails, there is a bug in oneflow that needs to be resolved.
+  CHECK_OR_RETURN(machine_id != -1 && device_id != -1)
+      << Error::RuntimeError()
+      << "Calculating the intersection of placements failed during execution of "
+         "asymmetric broadcast, placement_a: "
+      << *JUST(PlacementToString(src_parallel_desc))
+      << ", placement_b: " << *JUST(PlacementToString(dst_parallel_desc))
+      << "! Please submit an issue at `https://github.com/Oneflow-Inc/oneflow/issues` "
+         "and we will fix it as soon as possible";
   return machine_id;
 }

@@ -88,9 +98,14 @@ Maybe<one::Tensor> AsymmetricBroadcast(const std::shared_ptr<one::Tensor>& tensor,
   const auto& in_placement = in->placement();
   const auto& out_placement = out->placement();
   const auto& tensor_nd_sbp = JUST(tensor->nd_sbp());
-  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp());
+  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp())
+      << Error::RuntimeError() << "The sbp of input tensor (" << NdSbpToString(tensor_nd_sbp)
+      << ") must match the input sbp (" << NdSbpToString(in->nd_sbp()) << ")";
   const auto& tensor_placement = JUST(tensor->parallel_desc());
-  CHECK_OR_RETURN(tensor_placement == in_placement);
+  CHECK_OR_RETURN(tensor_placement == in_placement)
+      << Error::RuntimeError() << "The placement of input tensor ("
+      << *JUST(PlacementToString(tensor_placement)) << ") must match the input placement ("
+      << *JUST(PlacementToString(in_placement)) << ")";
   std::shared_ptr<one::Tensor> local_tensor = JUST(tensor->cur_rank_phy_tensor());
   if (out->placement()->Bigger(*in->placement())) {
     const auto& out_parallel_id = JUST(GetParallelId4CurrentProcessCtx(out_placement));
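For readers unfamiliar with OneFlow's error plumbing, the checks above rely on CHECK_OR_RETURN expanding, on failure, to a return of an error object that the call site keeps streaming context into. A minimal self-contained sketch of that mechanism, using illustrative stand-ins rather than OneFlow's real Maybe/Error types:

#include <iostream>
#include <sstream>
#include <string>

// Stand-ins for oneflow::Maybe<void> and Error::RuntimeError(); illustrative only.
struct MaybeVoid {
  bool ok;
  std::string error_msg;
  static MaybeVoid Ok() { return {true, ""}; }
};

struct Error {
  std::ostringstream msg;
  static Error RuntimeError() { return Error{}; }
  template <typename T>
  Error& operator<<(const T& v) {
    msg << v;
    return *this;
  }
  operator MaybeVoid() { return MaybeVoid{false, msg.str()}; }
};

// Sketch of CHECK_OR_RETURN: on failure it expands to `return Error ...`, so any
// text the call site streams after the macro lands in the returned error message.
#define CHECK_OR_RETURN(cond) \
  if (!(cond)) return Error::RuntimeError() << "Check failed: (" #cond ") "

MaybeVoid CheckPositive(int x) {
  CHECK_OR_RETURN(x > 0) << "expected a positive value, got " << x;
  return MaybeVoid::Ok();
}

int main() {
  MaybeVoid r = CheckPositive(-3);
  // Prints: Check failed: (x > 0) expected a positive value, got -3
  if (!r.ok) { std::cout << r.error_msg << "\n"; }
  return 0;
}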
15 changes: 11 additions & 4 deletions oneflow/core/boxing/boxing_dividor_util.cpp
@@ -52,10 +52,15 @@ decltype(ReplaceOutDeviceType) ReplaceOutDeviceType =
 namespace {

 Maybe<Symbol<PlacedNdSbp>> RawFlattenHierarchy(Symbol<PlacedNdSbp> placed_nd_sbp) {
-  CHECK_GE_OR_RETURN(placed_nd_sbp->nd_sbp()->sbp_parallel_size(), 0);
+  CHECK_GE_OR_RETURN(placed_nd_sbp->nd_sbp()->sbp_parallel_size(), 0)
+      << Error::RuntimeError() << "Invalid nd_sbp with ndim equal 0!";
   const auto& first_sbp_parallel = placed_nd_sbp->nd_sbp()->sbp_parallel(0);
   for (const auto& sbp_parallel : placed_nd_sbp->nd_sbp()->sbp_parallel()) {
-    CHECK_OR_RETURN(sbp_parallel == first_sbp_parallel);
+    CHECK_OR_RETURN(sbp_parallel == first_sbp_parallel)
+        << Error::RuntimeError()
+        << "Expected all sbps in the sbp list to be equal while flattening the hierarchy, "
+           "but found at least two different sbps, "
+        << SbpToString(first_sbp_parallel) << " and " << SbpToString(sbp_parallel) << "!";
   }
   std::vector<Symbol<SbpParallel>> vec{SymbolOf(first_sbp_parallel)};
   const auto& flattened_nd_sbp = JUST(GetNdSbp(vec));

@@ -77,8 +82,10 @@ Maybe<BoxingDividor> RawFlattenInHierarchy() {

 Maybe<Symbol<PlacedNdSbp>> RawUnflattenHierarchy(Symbol<PlacedNdSbp> in_placed_nd_sbp,
                                                  Symbol<PlacedNdSbp> out_placed_nd_sbp) {
-  CHECK_GE_OR_RETURN(in_placed_nd_sbp->nd_sbp()->sbp_parallel_size(), 0);
-  CHECK_GE_OR_RETURN(out_placed_nd_sbp->nd_sbp()->sbp_parallel_size(), 0);
+  CHECK_GE_OR_RETURN(in_placed_nd_sbp->nd_sbp()->sbp_parallel_size(), 0)
+      << Error::RuntimeError() << "Invalid nd_sbp with ndim equal 0!";
+  CHECK_GE_OR_RETURN(out_placed_nd_sbp->nd_sbp()->sbp_parallel_size(), 0)
+      << Error::RuntimeError() << "Invalid nd_sbp with ndim equal 0!";
   const auto& in_sbp_parallel = in_placed_nd_sbp->nd_sbp()->sbp_parallel(0);
   NdSbp unflattened_nd_sbp;
   for (int64_t i = 0; i < out_placed_nd_sbp->nd_sbp()->sbp_parallel_size(); ++i) {
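To make the flatten-hierarchy precondition concrete: a 2-D nd_sbp such as (broadcast, broadcast) over a (2, 2) device hierarchy can be flattened to the 1-D sbp (broadcast) over four devices, but only because every entry is identical. A toy stand-alone illustration of the rule being checked, using strings in place of SbpParallel (plain C++, not OneFlow's APIs):

#include <cassert>
#include <stdexcept>
#include <string>
#include <vector>

// Flattening an nd_sbp is only well-defined when all entries agree; this
// mirrors the checks RawFlattenHierarchy performs above.
std::vector<std::string> FlattenHierarchy(const std::vector<std::string>& nd_sbp) {
  if (nd_sbp.empty()) { throw std::runtime_error("Invalid nd_sbp with ndim equal 0!"); }
  for (const auto& sbp : nd_sbp) {
    if (sbp != nd_sbp.front()) {
      throw std::runtime_error("Expected all sbps to be equal, but found "
                               + nd_sbp.front() + " and " + sbp + "!");
    }
  }
  return {nd_sbp.front()};
}

int main() {
  assert(FlattenHierarchy({"B", "B"}) == std::vector<std::string>{"B"});  // (B, B) -> (B)
  // FlattenHierarchy({"S(0)", "B"});  // would throw: entries differ
  return 0;
}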
10 changes: 7 additions & 3 deletions oneflow/core/boxing/boxing_interpreter_status.cpp
@@ -35,15 +35,18 @@ Maybe<BoxingInterpreterStatus> RawMakeBoxingInterpreterStatus(const std::string&
 Maybe<BoxingInterpreterStatus> RawMakeComposedBoxingInterpreterStatus(
     const std::shared_ptr<BoxingInterpreterStatus>& lhs_status,
     const std::shared_ptr<BoxingInterpreterStatus>& rhs_status) {
-  CHECK_OR_RETURN(lhs_status->dst_placed_nd_sbp() == rhs_status->src_placed_nd_sbp())
+  CHECK_OR_RETURN(lhs_status->dst_placed_nd_sbp()
+                  == rhs_status->src_placed_nd_sbp())  // always true
+      << Error::RuntimeError()
       << "Intermediate placed_nd_sbp must be equal when compose boxing interpreter status"
       << ". lhs_status.dst_nd_sbp: " << NdSbpToString(lhs_status->dst_placed_nd_sbp()->nd_sbp())
       << ", rhs_status.dst_nd_sbp: " << NdSbpToString(rhs_status->src_placed_nd_sbp()->nd_sbp())
       << ", lhs_status.dst_placement: "
       << *JUST(PlacementToString(lhs_status->dst_placed_nd_sbp()->placement()))
       << ", rhs_status.dst_placement: "
       << *JUST(PlacementToString(rhs_status->src_placed_nd_sbp()->placement()));
-  CHECK_OR_RETURN(lhs_status->logical_shape() == rhs_status->logical_shape())
+  CHECK_OR_RETURN(lhs_status->logical_shape() == rhs_status->logical_shape())  // always true
+      << Error::RuntimeError()
       << "Logical_shape must be equal when compose boxing interpreter status"
       << ". lhs_status.logical_shape: " << (lhs_status->logical_shape().ToString())
       << ". rhs_status.logical_shape: " << (rhs_status->logical_shape().ToString());

@@ -95,7 +98,8 @@ Maybe<std::string> RawGetPlacementRouting(
 }

 Maybe<std::string> RawGetBoxingDesc(Symbol<std::vector<std::string>> sorted_boxing_names) {
-  CHECK_OR_RETURN(!sorted_boxing_names->empty()) << "sorted_boxing_names can't be empty!";
+  CHECK_OR_RETURN(!sorted_boxing_names->empty())  // always true
+      << Error::RuntimeError() << "boxing_names of eager boxing status can't be empty!";
   std::ostringstream ss;
   ss << sorted_boxing_names->at(0);
   for (size_t i = 1; i < sorted_boxing_names->size(); ++i) {
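The body of RawGetBoxingDesc's loop is cut off in the diff view above. Based on the visible setup, it presumably joins the sorted boxing names with a separator into one route description; a stand-alone sketch under that assumption (the " -> " separator is a guess, not confirmed by the diff):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Join sorted boxing names into a single human-readable route description.
std::string GetBoxingDesc(const std::vector<std::string>& sorted_boxing_names) {
  if (sorted_boxing_names.empty()) { return "<bug: empty boxing_names>"; }  // the PR's new check
  std::ostringstream ss;
  ss << sorted_boxing_names.at(0);
  for (size_t i = 1; i < sorted_boxing_names.size(); ++i) {
    ss << " -> " << sorted_boxing_names[i];  // assumed separator
  }
  return ss.str();
}

int main() {
  // e.g. a composed route produced by the boxing expression interpreter
  std::cout << GetBoxingDesc({"ccl-p2b", "asymmetric-broadcast"}) << "\n";
  return 0;
}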
58 changes: 38 additions & 20 deletions oneflow/core/boxing/ccl_boxing_function.cpp
@@ -45,6 +45,7 @@ bool IsAllSplitNdSbp(Symbol<NdSbp> nd_sbp, int64_t axis) {
   return true;
 }

+// NOLINTBEGIN(maybe-need-error-msg)
 Maybe<void> RawCheckCclP2B(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
                            const Shape& logical_shape) {
   CHECK_EQ_OR_RETURN(in->nd_sbp()->sbp_parallel_size(), 1);

@@ -54,9 +55,8 @@ Maybe<void> RawCheckCclP2B(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
   CHECK_OR_RETURN(IsAllBroadcastNdSbp(out->nd_sbp()));

   CHECK_OR_RETURN(in->placement() == out->placement());
-  CHECK_OR_RETURN(                                             // NOLINT(maybe-need-error-msg)
-      in->placement()->device_type() == DeviceType::kCPU       // NOLINT(maybe-need-error-msg)
-      || in->placement()->device_type() == DeviceType::kCUDA); // NOLINT(maybe-need-error-msg)
+  CHECK_OR_RETURN(in->placement()->device_type() == DeviceType::kCPU
+                  || in->placement()->device_type() == DeviceType::kCUDA);
   return Maybe<void>::Ok();
 }

@@ -73,9 +73,8 @@ Maybe<void> RawCheckCclP2S(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
   CHECK_OR_RETURN(logical_shape.At(0) % in->placement()->parallel_num() == 0);

   CHECK_OR_RETURN(in->placement() == out->placement());
-  CHECK_OR_RETURN(                                             // NOLINT(maybe-need-error-msg)
-      in->placement()->device_type() == DeviceType::kCPU       // NOLINT(maybe-need-error-msg)
-      || in->placement()->device_type() == DeviceType::kCUDA); // NOLINT(maybe-need-error-msg)
+  CHECK_OR_RETURN(in->placement()->device_type() == DeviceType::kCPU
+                  || in->placement()->device_type() == DeviceType::kCUDA);
   return Maybe<void>::Ok();
 }

@@ -93,9 +92,8 @@ Maybe<void> RawCheckCclS2B(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
   CHECK_OR_RETURN(logical_shape.At(0) % in->placement()->parallel_num() == 0);

   CHECK_OR_RETURN(in->placement() == out->placement());
-  CHECK_OR_RETURN(                                             // NOLINT(maybe-need-error-msg)
-      in->placement()->device_type() == DeviceType::kCPU       // NOLINT(maybe-need-error-msg)
-      || in->placement()->device_type() == DeviceType::kCUDA); // NOLINT(maybe-need-error-msg)
+  CHECK_OR_RETURN(in->placement()->device_type() == DeviceType::kCPU
+                  || in->placement()->device_type() == DeviceType::kCUDA);
   return Maybe<void>::Ok();
 }

@@ -119,50 +117,70 @@ Maybe<void> RawCheckCclS2S(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
   CHECK_OR_RETURN(logical_shape.At(out_split_axis) % in->placement()->parallel_num() == 0);

   CHECK_OR_RETURN(in->placement() == out->placement());
-  CHECK_OR_RETURN(                                             // NOLINT(maybe-need-error-msg)
-      in->placement()->device_type() == DeviceType::kCPU       // NOLINT(maybe-need-error-msg)
-      || in->placement()->device_type() == DeviceType::kCUDA); // NOLINT(maybe-need-error-msg)
+  CHECK_OR_RETURN(in->placement()->device_type() == DeviceType::kCPU
+                  || in->placement()->device_type() == DeviceType::kCUDA);
   return Maybe<void>::Ok();
 }

 static constexpr auto* CheckCclS2S = DECORATE(&RawCheckCclS2S, ThreadLocalCachedCopiable);
+// NOLINTEND(maybe-need-error-msg)

 }  // namespace

 Maybe<one::Tensor> CclP2B(const std::shared_ptr<one::Tensor>& tensor, Symbol<PlacedNdSbp> in,
                           Symbol<PlacedNdSbp> out) {
   const auto& tensor_nd_sbp = JUST(tensor->nd_sbp());
-  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp());
+  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp())
+      << Error::RuntimeError() << "The sbp of input tensor (" << NdSbpToString(tensor_nd_sbp)
+      << ") must match the input sbp (" << NdSbpToString(in->nd_sbp()) << ")";
   const auto& tensor_placement = JUST(tensor->parallel_desc());
-  CHECK_OR_RETURN(tensor_placement == in->placement());
+  CHECK_OR_RETURN(tensor_placement == in->placement())
+      << Error::RuntimeError() << "The placement of input tensor ("
+      << *JUST(PlacementToString(tensor_placement)) << ") must match the input placement ("
+      << *JUST(PlacementToString(in->placement())) << ")";
   return JUST(one::functional::ConsistentAllReduce(tensor));
 }

 Maybe<one::Tensor> CclP2S(const std::shared_ptr<one::Tensor>& tensor, Symbol<PlacedNdSbp> in,
                           Symbol<PlacedNdSbp> out) {
   const auto& tensor_nd_sbp = JUST(tensor->nd_sbp());
-  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp());
+  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp())
+      << Error::RuntimeError() << "The sbp of input tensor (" << NdSbpToString(tensor_nd_sbp)
+      << ") must match the input sbp (" << NdSbpToString(in->nd_sbp()) << ")";
   const auto& tensor_placement = JUST(tensor->parallel_desc());
-  CHECK_OR_RETURN(tensor_placement == in->placement());
+  CHECK_OR_RETURN(tensor_placement == in->placement())
+      << Error::RuntimeError() << "The placement of input tensor ("
+      << *JUST(PlacementToString(tensor_placement)) << ") must match the input placement ("
+      << *JUST(PlacementToString(in->placement())) << ")";

   return JUST(one::functional::ConsistentReduceScatter(tensor, "sum"));
 }

 Maybe<one::Tensor> CclS2B(const std::shared_ptr<one::Tensor>& tensor, Symbol<PlacedNdSbp> in,
                           Symbol<PlacedNdSbp> out) {
   const auto& tensor_nd_sbp = JUST(tensor->nd_sbp());
-  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp());
+  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp())
+      << Error::RuntimeError() << "The sbp of input tensor (" << NdSbpToString(tensor_nd_sbp)
+      << ") must match the input sbp (" << NdSbpToString(in->nd_sbp()) << ")";
   const auto& tensor_placement = JUST(tensor->parallel_desc());
-  CHECK_OR_RETURN(tensor_placement == in->placement());
+  CHECK_OR_RETURN(tensor_placement == in->placement())
+      << Error::RuntimeError() << "The placement of input tensor ("
+      << *JUST(PlacementToString(tensor_placement)) << ") must match the input placement ("
+      << *JUST(PlacementToString(in->placement())) << ")";
   return JUST(one::functional::ConsistentAllGather(tensor));
 }

 Maybe<one::Tensor> CclS2S(const std::shared_ptr<one::Tensor>& tensor, Symbol<PlacedNdSbp> in,
                           Symbol<PlacedNdSbp> out) {
   const auto& tensor_nd_sbp = JUST(tensor->nd_sbp());
-  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp());
+  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp())
+      << Error::RuntimeError() << "The sbp of input tensor (" << NdSbpToString(tensor_nd_sbp)
+      << ") must match the input sbp (" << NdSbpToString(in->nd_sbp()) << ")";
   const auto& tensor_placement = JUST(tensor->parallel_desc());
-  CHECK_OR_RETURN(tensor_placement == in->placement());
+  CHECK_OR_RETURN(tensor_placement == in->placement())
+      << Error::RuntimeError() << "The placement of input tensor ("
+      << *JUST(PlacementToString(tensor_placement)) << ") must match the input placement ("
+      << *JUST(PlacementToString(in->placement())) << ")";
   return JUST(one::functional::ConsistentS2S(tensor, *JUST(GetSbpList(out->nd_sbp()))));
 }
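All four CclP2B/CclP2S/CclS2B/CclS2S entry points now open with the same two checks. A possible follow-up, not part of this PR, would factor them into one helper; the sketch below reuses only identifiers that appear in the diff, but the helper itself is hypothetical and would need oneflow's headers to compile:

// Hypothetical helper (not in OneFlow): validate that an eager-boxing input
// tensor carries the sbp and placement the boxing function was planned for.
static Maybe<void> CheckEagerBoxingInput(const std::shared_ptr<one::Tensor>& tensor,
                                         Symbol<PlacedNdSbp> in) {
  const auto& tensor_nd_sbp = JUST(tensor->nd_sbp());
  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp())
      << Error::RuntimeError() << "The sbp of input tensor (" << NdSbpToString(tensor_nd_sbp)
      << ") must match the input sbp (" << NdSbpToString(in->nd_sbp()) << ")";
  const auto& tensor_placement = JUST(tensor->parallel_desc());
  CHECK_OR_RETURN(tensor_placement == in->placement())
      << Error::RuntimeError() << "The placement of input tensor ("
      << *JUST(PlacementToString(tensor_placement)) << ") must match the input placement ("
      << *JUST(PlacementToString(in->placement())) << ")";
  return Maybe<void>::Ok();
}

// Each boxing function would then start with: JUST(CheckEagerBoxingInput(tensor, in));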
15 changes: 11 additions & 4 deletions oneflow/core/boxing/cuda_copy_boxing_interpreter.cpp
@@ -30,12 +30,13 @@ Maybe<bool> IgnoringDeviceTypeEqual(Symbol<ParallelDesc> lhs, Symbol<ParallelDesc

 }  // namespace

+// NOLINTBEGIN(maybe-need-error-msg)
 Maybe<void> CheckCopyH2D(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
                          const Shape& logical_shape) {
   bool equal = JUST(IgnoringDeviceTypeEqual(in->placement(), out->placement()));
   CHECK_OR_RETURN(equal);
   CHECK_EQ_OR_RETURN(in->placement()->device_type(), DeviceType::kCPU);
-  CHECK_NE_OR_RETURN(out->placement()->device_type(), DeviceType::kCPU);  // NOLINT
+  CHECK_NE_OR_RETURN(out->placement()->device_type(), DeviceType::kCPU);
   CHECK_OR_RETURN(in->nd_sbp() == out->nd_sbp());
   return Maybe<void>::Ok();
 }

@@ -44,18 +45,24 @@ Maybe<void> CheckCopyD2H(Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out,
                          const Shape& logical_shape) {
   bool equal = JUST(IgnoringDeviceTypeEqual(in->placement(), out->placement()));
   CHECK_OR_RETURN(equal);
-  CHECK_NE_OR_RETURN(in->placement()->device_type(), DeviceType::kCPU);  // NOLINT
+  CHECK_NE_OR_RETURN(in->placement()->device_type(), DeviceType::kCPU);
   CHECK_EQ_OR_RETURN(out->placement()->device_type(), DeviceType::kCPU);
   CHECK_OR_RETURN(in->nd_sbp() == out->nd_sbp());
   return Maybe<void>::Ok();
 }
+// NOLINTEND(maybe-need-error-msg)

 Maybe<one::Tensor> CopyBoxingFunction(const std::shared_ptr<one::Tensor>& tensor,
                                       Symbol<PlacedNdSbp> in, Symbol<PlacedNdSbp> out) {
   const auto& tensor_nd_sbp = JUST(tensor->nd_sbp());
-  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp());
+  CHECK_OR_RETURN(tensor_nd_sbp == in->nd_sbp())
+      << Error::RuntimeError() << "The sbp of input tensor (" << NdSbpToString(tensor_nd_sbp)
+      << ") must match the input sbp (" << NdSbpToString(in->nd_sbp()) << ")";
   const auto& tensor_placement = JUST(tensor->parallel_desc());
-  CHECK_OR_RETURN(tensor_placement == in->placement());
+  CHECK_OR_RETURN(tensor_placement == in->placement())
+      << Error::RuntimeError() << "The placement of input tensor ("
+      << *JUST(PlacementToString(tensor_placement)) << ") must match the input placement ("
+      << *JUST(PlacementToString(in->placement())) << ")";
   std::shared_ptr<one::Tensor> local_tensor = JUST(tensor->cur_rank_phy_tensor());
   const auto& out_parallel_id = JUST(GetParallelId4CurrentProcessCtx(out->placement()));
   if (!out_parallel_id->has_value()) {
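The // NOLINT churn in this file and in ccl_boxing_function.cpp reflects commit 12e18e2 ("use NOLINTBEGIN"): per-line suppressions of the maybe-need-error-msg lint (OneFlow's rule that CHECK_*_OR_RETURN should carry an error message) are replaced with block markers around checks that are internal invariants. Schematically, as a fragment shaped like the checks above rather than runnable code:

// Per-line style (before): every flagged check is annotated individually.
CHECK_OR_RETURN(equal);  // NOLINT(maybe-need-error-msg)

// Block style (after): one marker pair covers a region of internal-invariant
// checks that intentionally carry no user-facing message.
// NOLINTBEGIN(maybe-need-error-msg)
CHECK_OR_RETURN(equal);
CHECK_OR_RETURN(in->nd_sbp() == out->nd_sbp());
// NOLINTEND(maybe-need-error-msg)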