Skip to content

Commit be55bac

Browse files
authored
[new-exec] enable check_nan_inf (#36802)
* enable check_nan_inf and fix variable scope
* add ut
* fix bug
* update ut
* revert doc change
* fix npu compile
1 parent 82fb63e commit be55bac

File tree

9 files changed

+108
-42
lines changed

9 files changed

+108
-42
lines changed

paddle/fluid/framework/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
117117
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
118118
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
119119

120-
cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto)
120+
cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto scope)
121121
if (WITH_GPU)
122122
target_link_libraries(var_type_traits dynload_cuda)
123123
endif()

paddle/fluid/framework/details/nan_inf_utils.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ namespace framework {
2727
namespace details {
2828
// Assert false when a NaN or Inf value is encountered.
2929
void CheckVarHasNanOrInf(const std::string& op_type,
30-
const framework::Scope& scope,
30+
const framework::ScopeBase& scope,
3131
const std::string& var_name,
3232
const platform::Place& place);
3333

@@ -37,7 +37,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
3737
const platform::Place& place);
3838

3939
void CheckOpHasNanOrInf(const framework::OperatorBase& op,
40-
const framework::Scope& scope,
40+
const framework::ScopeBase& scope,
4141
const platform::Place& place);
4242

4343
template <typename VarType>
@@ -55,7 +55,7 @@ void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
5555

5656
#ifdef PADDLE_WITH_ASCEND_CL
5757
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
58-
const framework::Scope& scope,
58+
const framework::ScopeBase& scope,
5959
const platform::Place& place);
6060
#endif
6161

paddle/fluid/framework/details/nan_inf_utils_detail.cc

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
407407
}
408408

409409
void CheckVarHasNanOrInf(const std::string& op_type,
410-
const framework::Scope& scope,
410+
const framework::ScopeBase& scope,
411411
const std::string& var_name,
412412
const platform::Place& place) {
413413
auto* var = scope.FindVar(var_name);
@@ -440,7 +440,7 @@ static framework::Tensor& npu_float_status() {
440440
}
441441

442442
void NPUAllocAndClearFloatStatus(const framework::OperatorBase& op,
443-
const framework::Scope& scope,
443+
const framework::ScopeBase& scope,
444444
const platform::Place& place) {
445445
if (!platform::is_npu_place(place)) return;
446446

@@ -505,7 +505,7 @@ void PrintNpuVarInfo(const std::string& op_type, const std::string& var_name,
505505
}
506506

507507
void PrintNPUOpValueInfo(const framework::OperatorBase& op,
508-
const framework::Scope& scope,
508+
const framework::ScopeBase& scope,
509509
const platform::Place& place) {
510510
LOG(WARNING) << "There are `nan` or `inf` in operator (" << op.Type()
511511
<< "), here we print some tensor value info of this op.";
@@ -523,7 +523,7 @@ void PrintNPUOpValueInfo(const framework::OperatorBase& op,
523523
}
524524

525525
static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
526-
const framework::Scope& scope,
526+
const framework::ScopeBase& scope,
527527
const platform::Place& place) {
528528
if (!platform::is_npu_place(place)) return;
529529

@@ -551,14 +551,13 @@ static void NPUCheckOpHasNanOrInf(const framework::OperatorBase& op,
551551

552552
if (sum >= 1.0) PrintNPUOpValueInfo(op, scope, place);
553553

554-
PADDLE_ENFORCE_LT(
555-
sum, 1.0, platform::errors::PreconditionNotMet(
556-
"Operator %s contains Nan/Inf.", op.DebugStringEx(&scope)));
554+
PADDLE_ENFORCE_LT(sum, 1.0, platform::errors::PreconditionNotMet(
555+
"Operator %s contains Nan/Inf.", op.Type()));
557556
}
558557
#endif
559558

560559
void CheckOpHasNanOrInf(const framework::OperatorBase& op,
561-
const framework::Scope& exec_scope,
560+
const framework::ScopeBase& exec_scope,
562561
const platform::Place& place) {
563562
std::call_once(white_list_init_flag, InitWhiteListFormEnv);
564563

paddle/fluid/framework/new_executor/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
22
lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
3-
graph_to_program_pass variable_helper timer monitor)
3+
graph_to_program_pass variable_helper timer monitor nan_inf_utils)
44

55
cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce)
66
cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS})

paddle/fluid/framework/new_executor/interpretercore.cc

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,15 @@
1717

1818
#include <unordered_set>
1919

20+
#include "paddle/fluid/framework/details/nan_inf_utils.h"
2021
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
2122
#include "paddle/fluid/platform/profiler.h"
2223

2324
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true,
2425
"Use inplace in new executor");
2526

27+
DECLARE_bool(check_nan_inf);
28+
2629
constexpr const char* kExceptionCaught = "ExceptionCaught";
2730

2831
namespace paddle {
@@ -80,7 +83,6 @@ paddle::framework::FetchList InterpreterCore::Run(
8083
auto FeedInput = [&] {
8184
for (size_t i = 0; i < feed_names_.size(); ++i) {
8285
auto* feed_var = global_scope_->Var(feed_names_[i]);
83-
8486
auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
8587
feed_tensor->ShareDataWith(feed_tensors[i]);
8688
}
@@ -246,10 +248,10 @@ void InterpreterCore::BuildInplace() {
246248
auto outvar = global_scope_->Var(iterout->second[0]);
247249
if (invar && outvar) {
248250
instr.AddInplace(invar, outvar);
249-
VLOG(3) << "inplace " << op_base->Type() << " "
250-
<< global_scope_->VarDesc(iter->second[0])->Name()
251+
VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type()
252+
<< " " << global_scope_->GetNameById(iter->second[0])
251253
<< " -> "
252-
<< global_scope_->VarDesc(iterout->second[0])->Name()
254+
<< global_scope_->GetNameById(iterout->second[0])
253255
<< std::endl;
254256
}
255257
}
@@ -330,6 +332,14 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
330332
platform::RecordEvent compute_event("Compute");
331333
instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
332334
}
335+
336+
// for debug nan/inf
337+
if (FLAGS_check_nan_inf) {
338+
VLOG(4) << "Check nan/inf";
339+
framework::details::CheckOpHasNanOrInf(
340+
*instr_node.OpBase(), *global_scope_,
341+
instr_node.DeviceContext().GetPlace());
342+
}
333343
}
334344

335345
void InterpreterCore::ExecuteInstructionList(

paddle/fluid/framework/new_executor/new_executor_defs.h

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -471,44 +471,73 @@ struct VariableMetaInfo {
471471
paddle::framework::VarDesc* vardesc_;
472472
};
473473

474-
// TODO(Aurelius84): Consider inherit ScopeBase to unify interface.
475-
class VariableScope {
474+
// TODO(zhiqiu): Maybe we need to add rwlock for VariableScope?
475+
class VariableScope : public ScopeBase {
476476
public:
477477
Variable* FindVar(const std::string& name) const {
478-
if (!HasVar(name)) {
479-
return nullptr;
478+
auto it = name2id_.find(name);
479+
if (it != name2id_.end()) {
480+
PADDLE_ENFORCE_LT(it->second, var_list_.size(),
481+
platform::errors::NotFound(
482+
"The id(%d) of variable(%s) should not be larger "
483+
"than the size of variable list(%d).",
484+
it->second, name, var_list_.size()));
485+
return var_list_[it->second];
480486
}
481-
auto var_id = VarId(name);
482-
CheckExist(var_id);
483-
return var_list[var_id];
487+
return nullptr;
488+
}
489+
490+
// Get variable id by name, return -1 if not found
491+
int GetIdByName(const std::string& name) const {
492+
auto it = name2id_.find(name);
493+
if (it != name2id_.end()) {
494+
return it->second;
495+
}
496+
return -1;
497+
}
498+
499+
// Get variable name by id, return "" if not found
500+
std::string GetNameById(int id) const {
501+
// NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since
502+
// vec_meta_info_[id].vardesc_ may be nullptr,
503+
// typically when the target variable does not exist in the original program
504+
// desc, but created by interpretercore.
505+
// For example, created and used by d2h_copy or h2d_copy operator.
506+
auto it =
507+
std::find_if(name2id_.begin(), name2id_.end(),
508+
[id](const auto& pair) { return pair.second == id; });
509+
if (it != name2id_.end()) {
510+
return it->first;
511+
}
512+
return "";
484513
}
485514

486515
bool HasVar(const std::string& name) const {
487-
return name2id.find(name) != name2id.end();
516+
return name2id_.find(name) != name2id_.end();
488517
}
489518

490519
int VarId(const std::string& name) const {
491520
CheckExist(name);
492-
return name2id.at(name);
521+
return name2id_.at(name);
493522
}
494523

495-
Variable* Var(int id) const { return var_list.at(id); }
524+
Variable* Var(int id) const { return var_list_.at(id); }
496525

497526
Variable* Var(const std::string& name) const {
498-
return var_list.at(VarId(name));
527+
return var_list_.at(VarId(name));
499528
}
500529

501-
size_t VarSize() const { return var_list.size(); }
530+
size_t VarSize() const { return var_list_.size(); }
502531

503532
void AddVar(const std::string& name, VarDesc* var_desc) { // NOLINT
504-
name2id[name] = VarSize();
533+
name2id_[name] = VarSize();
505534
auto v = new Variable();
506535
if (nullptr == var_desc) {
507536
v->GetMutable<LoDTensor>();
508537
} else {
509538
InitializeVariable(v, var_desc->GetType());
510539
}
511-
var_list.push_back(v);
540+
var_list_.push_back(v);
512541

513542
VariableMetaInfo info;
514543
info.var_ref_count_ = 0;
@@ -517,8 +546,8 @@ class VariableScope {
517546
}
518547

519548
void AddVar(const std::string& name, Variable& var) { // NOLINT
520-
name2id[name] = VarSize();
521-
var_list.push_back(&var);
549+
name2id_[name] = VarSize();
550+
var_list_.push_back(&var);
522551

523552
VariableMetaInfo info;
524553
info.var_ref_count_ = 0;
@@ -540,10 +569,10 @@ class VariableScope {
540569
}
541570

542571
void CheckExist(int id) const {
543-
PADDLE_ENFORCE_LT(id, var_list.size(),
572+
PADDLE_ENFORCE_LT(id, var_list_.size(),
544573
platform::errors::PreconditionNotMet(
545574
"Required var_id < %d, but received var_id = %d.",
546-
var_list.size(), id));
575+
var_list_.size(), id));
547576
}
548577

549578
void CheckExist(const std::string& name) const {
@@ -553,8 +582,8 @@ class VariableScope {
553582
}
554583

555584
private:
556-
std::vector<Variable*> var_list;
557-
std::map<std::string, int> name2id;
585+
std::vector<Variable*> var_list_;
586+
std::map<std::string, int> name2id_;
558587
std::vector<VariableMetaInfo> vec_meta_info_;
559588
};
560589

paddle/fluid/framework/scope.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,16 @@ class Variable;
3939
namespace paddle {
4040
namespace framework {
4141

42+
// TODO(zhiqiu): add more functions to the base class
43+
class ScopeBase {
44+
public:
45+
/// Find a variable in the scope or any of its ancestors. Returns
46+
/// nullptr if cannot find.
47+
/// Caller doesn't own the returned Variable.
48+
virtual Variable* FindVar(const std::string& name) const = 0;
49+
virtual ~ScopeBase() {}
50+
};
51+
4252
class Scope;
4353

4454
/**
@@ -49,7 +59,7 @@ class Scope;
4959
* One net can run in different scopes and update different variable in the
5060
* scope.
5161
*/
52-
class Scope {
62+
class Scope : public ScopeBase {
5363
public:
5464
Scope() {}
5565
~Scope();

paddle/fluid/framework/var_type_traits.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ class BKCLCommunicator;
6969

7070
namespace framework {
7171
class LoDRankTable;
72+
class ScopeBase;
7273
class LoDTensor;
7374
class ReaderHolder;
7475
class Scope;

python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,12 @@ def build_program(self):
256256
main_program = paddle.static.Program()
257257
startup_program = paddle.static.Program()
258258
with paddle.static.program_guard(main_program, startup_program):
259-
w = paddle.rand([10, 20])
259+
w = paddle.rand([10, 3])
260260
ids = paddle.static.data(name="id", shape=[5], dtype='int64')
261+
data = paddle.static.data(name="data", shape=[3], dtype='float32')
261262
emb = paddle.nn.functional.embedding(
262263
x=ids, weight=w, sparse=False, name="embedding")
264+
emb = emb + data
263265

264266
return main_program, startup_program, emb
265267

@@ -273,7 +275,7 @@ def _run(self, feeds):
273275

274276
for feed in feeds:
275277
out = exe.run(main_program, feed=feed, fetch_list=fetch_vars)
276-
278+
print(out)
277279
return out
278280

279281
def run_new_executor(self, feed):
@@ -284,12 +286,27 @@ def run_new_executor(self, feed):
284286

285287
def test_exception(self):
286288
feed = [{
287-
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64)
289+
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
290+
'data': np.array([1, 2, 3, 4]).astype(np.float32),
288291
}, {
289-
'id': np.array([1, 2, 3, 4, 11]).astype(np.int64)
292+
'id': np.array([1, 2, 3, 4, 11]).astype(np.int64),
293+
'data': np.array([1, 2, 3, 4]).astype(np.float32),
290294
}]
291295
self.assertRaises(ValueError, self.run_new_executor, feed)
292296

297+
def test_nan(self):
298+
flags = {'FLAGS_check_nan_inf': True}
299+
paddle.fluid.set_flags(flags)
300+
feed = [{
301+
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
302+
'data': np.array([1, 2, 3]).astype(np.float32),
303+
}, {
304+
'id': np.array([1, 2, 3, 4, 5]).astype(np.int64),
305+
'data': np.array([1, 2, 3]).astype(np.float32),
306+
}]
307+
feed[1]['data'][0] = np.nan
308+
self.assertRaises(RuntimeError, self.run_new_executor, feed)
309+
293310

294311
if __name__ == "__main__":
295312
unittest.main()

0 commit comments

Comments
 (0)