Skip to content

Commit ffd1fae

Browse files
authored
Add EDL use tutorial. (#67)
1 parent ad07ac4 commit ffd1fae

29 files changed

+142
-44
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ vendor/
44
.vscode/
55
*.pyc
66
build/
7+
*.log
8+
resnet50_pod/
9+
.*.swp

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ set(EDL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
44
set(EDL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
55
SET(EDL_INSTALL_DIR ${CMAKE_BINARY_DIR}/output)
66
SET(CMAKE_INSTALL_RPATH "$ORIGIN" "${CMAKE_INSTALL_RPATH}")
7-
project(paddle-edl)
7+
project(paddle_edl)
88

99
include(python)
1010

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ of EDL on this cluster is
3838
- [Fault-Tolerant Training in PaddlePaddle](./doc/fault_tolerance.md).
3939
- [Elastic Deep Learning Design Doc:compute engine](./doc/edl_collective_design_doc.md).
4040
- [Elastic Deep Learning Design Doc:Scheduler](./doc/edl_design_doc.md).
41+
- [Run Elastic Deep Learning Demo on a single node](./doc/collective_demo.md).
4142

4243
## FAQ
4344

doc/collective_demo.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Introduction
2+
This demo is for EDL developers: you can test Paddle EDL functionality without a Kubernetes cluster, and it is simple to run on one or multiple nodes.
3+
Of course, this is also a toy. You can play with it!
4+
Have fun!
5+
6+
# Install
7+
1. Install EDL from source
8+
9+
```
10+
git clone https://github.com/PaddlePaddle/edl
11+
cd edl
12+
mkdir build && cd build
13+
cmake ..
14+
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
15+
```
16+
17+
2. Alternatively, install the released version with `pip install paddle_edl`.
18+
19+
# Run the demo on a single node
20+
1. Start a Jobserver on one node.
21+
22+
```
23+
git clone https://github.com/PaddlePaddle/edl
24+
cd python/paddle_edl/demo/collective
25+
./start_job_server.sh
26+
```
27+
28+
2. Start a Jobclient on every node. Jobclient controls the POD process.
29+
30+
```
31+
#Set the ImageNet data path
32+
export PADDLE_EDL_IMAGENET_PATH=<your path>
33+
#Set the checkpoint path
34+
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>
35+
36+
mkdir -p resnet50_pod
37+
./start_job_client.sh
38+
```

doc/collective_demo_cn.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# 前言
2+
在单节点或者多个节点(物理机器或者虚拟机或者Docker之类的)搭建EDL主要是为开发者准备的:没有集群的情况下也可以对Paddle(计算引擎)模拟进行EDL的测试。
3+
当然,这个过程也有点意思,看着训练进程起起伏伏而且不影响最后的结果,还是蛮有意思的。
4+
Have fun!
5+
6+
# 安装EDL
7+
1. 你可以从源代码编译安装
8+
9+
```
10+
git clone https://github.com/PaddlePaddle/edl
11+
cd edl
12+
mkdir build && cd build
13+
cmake ..
14+
pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
15+
```
16+
17+
2. 也可以直接使用`pip`安装我们发布的版本`pip install paddle_edl`
18+
19+
# demo搭建步骤:以单节点为例
20+
1. 我们需要在一个节点上启动JobServer的demo,用来记录训练任务的Pod信息。
21+
22+
```
23+
git clone https://github.com/PaddlePaddle/edl
24+
cd python/paddle_edl/demo/collective
25+
./start_job_server.sh
26+
```
27+
2. 我们需要在(各个)节点上启动一个JobClient的demo,用来管理训练的Pod进程。
28+
29+
```
30+
#指定ImageNet的数据目录路径
31+
export PADDLE_EDL_IMAGENET_PATH=<your path>
32+
#指定`checkpoint`的目录,用来保存checkpoint
33+
export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>
34+
35+
mkdir -p resnet50_pod
36+
./start_job_client.sh
37+
```

example/collective/resnet50/train_pretrain.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/bin/bash
22
export FLAGS_sync_nccl_allreduce=1
33
export FLAGS_cudnn_exhaustive_search=1
4-
export FLAGS_conv_workspace_size_limit=4000 #MB
4+
#export FLAGS_conv_workspace_size_limit=4000 #MB
55
export FLAGS_cudnn_batchnorm_spatial_persistent=1
66

77
export GLOG_v=1
@@ -18,7 +18,7 @@ if [[ ${use_dali} == "True" ]]; then
1818
export FLAGS_fraction_of_gpu_memory_to_use=0.8
1919
fi
2020

21-
python -m paddle-edl.launch ${distributed_args} \
21+
python -m paddle_edl.collective.launch ${distributed_args} \
2222
--log_dir log \
2323
--log_level 20 \
2424
./train_with_fleet.py \

example/collective/resnet50/train_with_fleet.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,13 @@
8787
add_arg('do_test', bool, False, "Whether do test every epoch.")
8888
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
8989
add_arg('fuse', bool, False, "Whether to use tensor fusion.")
90-
add_arg('fuse_elewise_add_act_ops', bool, True, "Whether to use elementwise_act fusion.")
91-
add_arg('fuse_bn_act_ops', bool, True, "Whether to use bn_act fusion.")
90+
add_arg('fuse_elewise_add_act_ops', bool, False, "Whether to use elementwise_act fusion.")
91+
add_arg('fuse_bn_act_ops', bool, False, "Whether to use bn_act fusion.")
9292
add_arg('nccl_comm_num', int, 1, "nccl comm num")
9393
add_arg("use_hierarchical_allreduce", bool, False, "Use hierarchical allreduce or not.")
9494
add_arg('num_threads', int, 1, "Use num_threads to run the fluid program.")
9595
add_arg('num_iteration_per_drop_scope', int, 100, "Ihe iteration intervals to clean up temporary variables.")
96-
add_arg('benchmark_test', bool, True, "Whether to use print benchmark logs or not.")
96+
add_arg('benchmark_test', bool, False, "Whether to use print benchmark logs or not.")
9797

9898
add_arg('use_dgc', bool, False, "Whether use DGCMomentum Optimizer or not")
9999
add_arg('rampup_begin_step', int, 5008, "The beginning step from which dgc is implemented.")

python/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@ file(GLOB_RECURSE EDL_FILES collective/*.py demo/*.py demo/*.sh setup.py)
22
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
33
add_custom_command(
44
OUTPUT ${EDL_BINARY_DIR}/.timestamp
5-
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/edl ${EDL_BINARY_DIR}/python/
5+
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_edl ${EDL_BINARY_DIR}/python/
66
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} ./setup.py bdist_wheel
77
DEPENDS ${EDL_FILES})
88
add_custom_target(edl_python ALL DEPENDS ${EDL_BINARY_DIR}/.timestamp)
9-
add_subdirectory(edl/tests/unittests)
9+
add_subdirectory(paddle_edl/tests/unittests)

python/edl/tests/unittests/start_edl_demo.sh

Lines changed: 0 additions & 2 deletions
This file was deleted.

0 commit comments

Comments
 (0)