Description
When I build a PMEM memkind environment from the latest commit and run the launch script, the following error appears.
2. The build options I used
bazel build --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --host_cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" -c opt --copt="-L/usr/local/lib" --copt="-lpmem" --copt="-lmemkind" --config=opt //tensorflow/tools/pip_package:build_pip_package
-
The script I used
numactl -N 1 ./launch.sh --batch_size=1280 --dim_size=512 --max_mock_id_amplify=1800 --num_steps=2000 --ev_storage=pmem_memkind -
error logs
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Traceback (most recent call last):
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1365, in _do_call
return fn(*args)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1350, in _run_fn
target_list, run_metadata)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1443, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: From /job:ps/replica:0/task:0:
MultiLevel EV's Cache size -1 should large than IDs in batch 1280
[[{{node fm/embedding_lookup_36}}]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "./benchmark.py", line 228, in
tf.app.run()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "./benchmark.py", line 203, in main
sess.run(train_op)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 804, in run
run_metadata=run_metadata)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1309, in run
run_metadata=run_metadata)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1410, in run
raise six.reraise(*original_exc_info)
File "/home/pai/lib/python3.6/site-packages/six.py", line 719, in reraise
raise value
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1395, in run
return self._sess.run(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1468, in run
run_metadata=run_metadata)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/training/monitored_session.py", line 1226, in run
return self._sess.run(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 956, in run
run_metadata_ptr)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1180, in _run
feed_dict_tensor, options, run_metadata)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1359, in _do_run
run_metadata)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/client/session.py", line 1384, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: From /job:ps/replica:0/task:0:
MultiLevel EV's Cache size -1 should large than IDs in batch 1280
[[node fm/embedding_lookup_36 (defined at /home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]
Original stack trace for 'fm/embedding_lookup_36':
File "./benchmark.py", line 228, in
tf.app.run()
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/platform/app.py", line 40, in run
_run(main=main, argv=argv, flags_parser=_parse_flags_tolerate_undef)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/home/pai/lib/python3.6/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "./benchmark.py", line 121, in main
tf.nn.embedding_lookup(fm_w, batch['col{}'.format(sidx)]))
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/embedding_ops.py", line 418, in embedding_lookup
counts=counts)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/embedding_ops.py", line 184, in _embedding_lookup_and_transform
counts=counts),
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/util/dispatch.py", line 180, in wrapper
return target(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py", line 3958, in gather
counts=counts)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/kv_variable_ops.py", line 749, in sparse_read
name=name)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_kv_variable_ops.py", line 647, in kv_resource_gather
validate_indices=validate_indices, name=name)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
op_def=op_def)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
return func(*args, **kwargs)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
attrs, op_def, compute_device)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
op_def=op_def)
File "/home/pai/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in init
self._traceback = tf_stack.extract_stack()
Activity