elasticdeeplearning
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎doc/collective_demo.md‎
Lines changed: 38 additions & 0 deletions b/‎doc/collective_demo.md‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎doc/collective_demo_cn.md‎
Lines changed: 37 additions & 0 deletions b/‎doc/collective_demo_cn.md‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎example/collective/resnet50/train_pretrain.sh‎
Lines changed: 2 additions & 2 deletions b/‎example/collective/resnet50/train_pretrain.sh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎example/collective/resnet50/train_with_fleet.py‎
Lines changed: 3 additions & 3 deletions b/‎example/collective/resnet50/train_with_fleet.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎python/CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions b/‎python/CMakeLists.txt‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python/edl/tests/unittests/start_edl_demo.sh‎
Lines changed: 0 additions & 2 deletions b/‎python/edl/tests/unittests/start_edl_demo.sh‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎python/edl/__init__.py‎ ‎python/paddle_edl/__init__.py‎python/edl/__init__.py renamed to python/paddle_edl/__init__.py b/‎python/edl/__init__.py‎ ‎python/paddle_edl/__init__.py‎python/edl/__init__.py renamed to python/paddle_edl/__init__.py
@@ -4,3 +4,6 @@ vendor/
 .vscode/
 *.pyc
 build/
+*.log
+resnet50_pod/
+.*.swp
@@ -4,7 +4,7 @@ set(EDL_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(EDL_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 SET(EDL_INSTALL_DIR ${CMAKE_BINARY_DIR}/output)
 SET(CMAKE_INSTALL_RPATH "$ORIGIN" "${CMAKE_INSTALL_RPATH}")
-project(paddle-edl)
+project(paddle_edl)
 
 include(python)
 
 
@@ -38,6 +38,7 @@ of EDL on this cluster is
   -  [Fault-Tolerant Training in PaddlePaddle](./doc/fault_tolerance.md).
   -  [Elastic Deep Learning Design Doc:compute engine](./doc/edl_collective_design_doc.md).
   -  [Elastic Deep Learning Design Doc:Scheduler](./doc/edl_design_doc.md).
+  -  [Run Elastic Deep Learning Demo on a single node](./doc/collective_demo.md).
 
 ## FAQ
 
 
@@ -0,0 +1,38 @@
+# Introduction
+This demo is for developers of EDL: you can test Paddle EDL's functionality without a Kubernetes cluster, and it is simple to run on one or multiple nodes.
+Of course, this is also a toy. You can play with it!
+Have fun!
+
+# Install
+1. Install EDL from source
+
+```
+git clone https://github.com/PaddlePaddle/edl
+cd edl
+mkdir build && cd build
+cmake .. && make
+pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
+```
+
+2. Alternatively, install the released package with `pip install paddle_edl`.
+
+# Run the demo on a single node
+1. Start a Jobserver on one node. 
+
+```
+git clone https://github.com/PaddlePaddle/edl
+cd python/paddle_edl/demo/collective
+./start_job_server.sh
+```
+
+2. Start a JobClient on every node. The JobClient controls the pod processes.
+
+```
+#Set the ImageNet data path
+export PADDLE_EDL_IMAGENET_PATH=<your path>
+#Set the checkpoint path
+export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>
+
+mkdir -p resnet50_pod
+./start_job_client.sh
+```
@@ -0,0 +1,37 @@
+# 前言
+在单节点或者多个节点（物理机器或者虚拟机或者Docker之类的）搭建EDL主要是为开发者准备的：没有集群的情况下也可以对Paddle(计算引擎)模拟进行EDL的测试。
+当然，这个过程也有点意思，看着训练进程起起伏伏而且不影响最后的结果，还是蛮有意思的。
+Have fun!
+
+# 安装EDL
+1. 你可以从源代码编译安装
+
+```
+git clone https://github.com/PaddlePaddle/edl
+cd edl
+mkdir build && cd build
+cmake .. && make
+pip install ./python/dist/paddle_edl-0.0.0-py2-none-any.whl
+```
+
+2. 也可以直接使用`pip`安装我们发布的版本`pip install paddle_edl`
+
+# demo搭建步骤：以单节点为例
+1. 我们需要在一个节点上启动JobServer的demo，用来记录训练任务的Pod信息。
+
+```
+git clone https://github.com/PaddlePaddle/edl
+cd python/paddle_edl/demo/collective
+./start_job_server.sh
+```
+2. 我们需要在(各个)节点上启动一个JobClient的demo，用来管理训练的Pod进程。  
+
+```
+#指定ImageNet的数据目录路径
+export PADDLE_EDL_IMAGENET_PATH=<your path>
+#指定`checkpoint`的目录，用来保存checkpoint
+export PADDLE_EDL_FLEET_CHECKPOINT_PATH=<your path>
+
+mkdir -p resnet50_pod
+./start_job_client.sh
+```
@@ -1,7 +1,7 @@
 #!/bin/bash
 export FLAGS_sync_nccl_allreduce=1
 export FLAGS_cudnn_exhaustive_search=1
-export FLAGS_conv_workspace_size_limit=4000 #MB
+#export FLAGS_conv_workspace_size_limit=4000 #MB
 export FLAGS_cudnn_batchnorm_spatial_persistent=1
 
 export GLOG_v=1
@@ -18,7 +18,7 @@ if [[ ${use_dali} == "True" ]]; then
     export FLAGS_fraction_of_gpu_memory_to_use=0.8
 fi
 
-python -m paddle-edl.launch ${distributed_args} \
+python -m paddle_edl.collective.launch ${distributed_args} \
        --log_dir log \
        --log_level 20 \
        ./train_with_fleet.py \
 
@@ -87,13 +87,13 @@
 add_arg('do_test',          bool,  False,                 "Whether do test every epoch.")
 add_arg('use_gpu',          bool,  True,                 "Whether to use GPU or not.")
 add_arg('fuse', bool, False,                      "Whether to use tensor fusion.")
-add_arg('fuse_elewise_add_act_ops', bool, True,                      "Whether to use elementwise_act fusion.")
-add_arg('fuse_bn_act_ops', bool, True,                      "Whether to use bn_act fusion.")
+add_arg('fuse_elewise_add_act_ops', bool, False,                      "Whether to use elementwise_act fusion.")
+add_arg('fuse_bn_act_ops', bool, False,                      "Whether to use bn_act fusion.")
 add_arg('nccl_comm_num',        int,  1,                  "nccl comm num")
 add_arg("use_hierarchical_allreduce",     bool,   False,   "Use hierarchical allreduce or not.")
 add_arg('num_threads',        int,  1,                   "Use num_threads to run the fluid program.")
 add_arg('num_iteration_per_drop_scope', int,    100,      "Ihe iteration intervals to clean up temporary variables.")
-add_arg('benchmark_test',          bool,  True,                 "Whether to use print benchmark logs or not.")
+add_arg('benchmark_test',          bool,  False,                 "Whether to use print benchmark logs or not.")
 
 add_arg('use_dgc',           bool,  False,          "Whether use DGCMomentum Optimizer or not")
 add_arg('rampup_begin_step', int,   5008,           "The beginning step from which dgc is implemented.")
 
@@ -2,8 +2,8 @@ file(GLOB_RECURSE EDL_FILES collective/*.py demo/*.py demo/*.sh setup.py)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 add_custom_command(
 	OUTPUT ${EDL_BINARY_DIR}/.timestamp
-	COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/edl ${EDL_BINARY_DIR}/python/
+	COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/paddle_edl ${EDL_BINARY_DIR}/python/
 	COMMAND env ${py_env} ${PYTHON_EXECUTABLE} ./setup.py bdist_wheel
     DEPENDS ${EDL_FILES})
 add_custom_target(edl_python ALL DEPENDS ${EDL_BINARY_DIR}/.timestamp)
-add_subdirectory(edl/tests/unittests)
+add_subdirectory(paddle_edl/tests/unittests)