Commit bb58b63

Merge branch 'develop' into softmax_with_cross_entropy_op
2 parents: f1d5fb3 + 5b5f4f5

152 files changed, +5972 -1156 lines


CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ endif()
 
 if(ANDROID OR IOS)
     if(ANDROID)
-        if(AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
+        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
             message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
         elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
             # TODO: support glog for Android api 16 ~ 19 in the future

cmake/generic.cmake

Lines changed: 33 additions & 29 deletions
@@ -106,22 +106,22 @@ function(merge_static_libs TARGET_NAME)
   endforeach()
   list(REMOVE_DUPLICATES libs_deps)
 
-  if(APPLE) # Use OSX's libtool to merge archives
-    # To produce a library we need at least one source file.
-    # It is created by add_custom_command below and will helps
-    # also help to track dependencies.
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+  # To produce a library we need at least one source file.
+  # It is created by add_custom_command below and will helps
+  # also help to track dependencies.
+  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
 
+  if(APPLE) # Use OSX's libtool to merge archives
     # Make the generated dummy source file depended on all static input
     # libs. If input lib changes,the source file is touched
     # which causes the desired effect (relink).
-    add_custom_command(OUTPUT ${dummyfile}
-      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
       DEPENDS ${libs})
 
     # Generate dummy staic lib
-    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
     target_link_libraries(${TARGET_NAME} ${libs_deps})
 
     foreach(lib ${libs})
@@ -130,43 +130,47 @@ function(merge_static_libs TARGET_NAME)
     endforeach()
     add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
-      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      )
   else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
+
     foreach(lib ${libs})
-      set(objlistfile ${lib}.objlist) # list of objects in the input library
-      set(objdir ${lib}.objdir)
+      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
+      set(objdir ${target_DIR}/${lib}.objdir)
 
       add_custom_command(OUTPUT ${objdir}
        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
        DEPENDS ${lib})
 
      add_custom_command(OUTPUT ${objlistfile}
        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
-        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
        DEPENDS ${lib} ${objdir}
        WORKING_DIRECTORY ${objdir})
 
-      # Empty dummy source file that goes into merged library
-      set(mergebase ${lib}.mergebase.c)
-      add_custom_command(OUTPUT ${mergebase}
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-        DEPENDS ${objlistfile})
-
-      list(APPEND mergebases "${mergebase}")
+      list(APPEND target_OBJS "${objlistfile}")
    endforeach()
 
-    add_library(${TARGET_NAME} STATIC ${mergebases})
+    # Make the generated dummy source file depended on all static input
+    # libs. If input lib changes,the source file is touched
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs} ${target_OBJS})
+
+    # Generate dummy staic lib
+    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
    target_link_libraries(${TARGET_NAME} ${libs_deps})
 
    # Get the file name of the generated library
-    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
 
-    foreach(lib ${libs})
-      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
-        COMMAND ${CMAKE_RANLIB} ${outlibfile}
-        WORKING_DIRECTORY ${lib}.objdir)
-    endforeach()
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
+      COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
+      WORKING_DIRECTORY ${target_DIR})
  endif()
 endfunction(merge_static_libs)
 
@@ -196,7 +200,7 @@ function(cc_library TARGET_NAME)
    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
 
  else(cc_library_SRCS)
-    if (cc_library_DEPS)
+    if(cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
    else()
      message(FATAL "Please specify source file or library in cc_library.")

cmake/util.cmake

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ function(target_circle_link_libraries TARGET_NAME)
      endif()
    endforeach()
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-      if(IOS AND NOT IOS_ENABLE_BITCODE)
+      if(NOT IOS_ENABLE_BITCODE)
        list(APPEND LIBS "-undefined dynamic_lookup")
      endif()
    endif()

doc/design/api.md

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 ## Ingredients
 
 As our design principle is starting from the essence: how could we
-allow users to express and solve their problems at neural networks.
+allow users to express and solve their problems as neural networks.
 Some essential concepts that our API have to provide include:
 
 1. A *topology* is an expression of *layers*.
@@ -233,7 +233,7 @@ paddle.dist_train(model,
                   num_parameter_servers=15)
 ```
 
-The pseudo code if `paddle.dist_train` is as follows:
+The pseudo code of `paddle.dist_train` is as follows:
 
 ```python
 def dist_train(topology, parameters, trainer, reader, ...):

doc/design/auto_gradient_check.md

Lines changed: 36 additions & 36 deletions
@@ -1,17 +1,17 @@
 ## Auto Gradient Checker Design
 
 ## Backgraound:
-- Operator forward computing is easy to check if the result is right because it has a clear definition. **But** backpropagation is a notoriously difficult algorithm to debug and get right:
-- 1. you should get the right backpropagation formula according to the forward computation.
-- 2. you should implement it right in CPP.
-- 3. it's difficult to prepare test data.
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
+  1. you should get the right backpropagation formula according to the forward computation.
+  2. you should implement it right in CPP.
+  3. it's difficult to prepare test data.
 
-- Auto gradient check gets a numeric gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
-- 1. numeric gradient checker only need forward operator.
-- 2. user only need to prepare the input data for forward Operator.
+- Auto gradient checking gets a numerical gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
+  1. numerical gradient checker only need forward operator.
+  2. user only need to prepare the input data for forward Operator.
 
 ## Mathematical Theory
-The following two document from stanford has a detailed explanation of how to get numeric gradient and why it's useful.
+The following two document from Stanford has a detailed explanation of how to get numerical gradient and why it's useful.
 
 - [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
 - [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
@@ -20,7 +20,7 @@ The following two document from stanford has a detailed explanation of how to ge
 ## Numeric Gradient Implementation
 ### Python Interface
 ```python
-def get_numeric_gradient(op,
+def get_numerical_gradient(op,
                          input_values,
                          output_name,
                          input_to_check,
@@ -30,13 +30,13 @@ def get_numeric_gradient(op,
     Get Numeric Gradient for an operator's input.
 
     :param op: C++ operator instance, could be an network
-    :param input_values: The input variables. Should be an dictionary, key is
-        variable name. Value is numpy array.
+    :param input_values: The input variables. Should be an dictionary, whose key is
+        variable name, and value is numpy array.
     :param output_name: The final output variable name.
-    :param input_to_check: The input variable need to get gradient.
+    :param input_to_check: The input variable with respect to which to compute the gradient.
     :param delta: The perturbation value for numeric gradient method. The
        smaller delta is, the more accurate result will get. But if that delta is
-        too small, it could occur numerical stability problem.
+        too small, it will suffer from numerical stability problem.
    :param local_scope: The local scope used for get_numeric_gradient.
    :return: The gradient array in numpy format.
    """
@@ -45,28 +45,28 @@ def get_numeric_gradient(op,
 ### Explaination:
 
 - Why need `output_name`
-  - One Operator may have multiple Output, you can get independent gradient from each Output. So user should set one output to calculate.
+  - An Operator may have multiple Output, one can get independent gradient from each Output. So caller should specify the name of the output variable.
 
 - Why need `input_to_check`
-  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these Inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
 
 
 ### Core Algorithm Implementation
 
 
 ```python
-# we only compute gradient of one element each time.
-# we use a for loop to compute the gradient of every element.
+# we only compute gradient of one element a time.
+# we use a for loop to compute the gradient of each element.
 for i in xrange(tensor_size):
-    # get one input element throw it's index i.
+    # get one input element by its index i.
     origin = tensor_to_check.get_float_element(i)
 
-    # add delta to it, run op and then get the sum of the result tensor.
+    # add delta to it, run op and then get the new value of the result tensor.
    x_pos = origin + delta
    tensor_to_check.set_float_element(i, x_pos)
    y_pos = get_output()
 
-    # plus delta to this element, run op and get the sum of the result tensor.
+    # plus delta to this element, run op and get the new value of the result tensor.
    x_neg = origin - delta
    tensor_to_check.set_float_element(i, x_neg)
    y_neg = get_output()
@@ -85,15 +85,15 @@ def get_numeric_gradient(op,
 
 Each Operator Kernel has three kinds of Gradient:
 
-- 1. Numeric Gradient
-- 2. CPU Operator Gradient
-- 3. GPU Operator Gradient(if supported)
+1. Numerical gradient
+2. CPU kernel gradient
+3. GPU kernel gradient (if supported)
 
-Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as the reference value.
+The numerical gradient only relies on forward Operator. So we use the numerical gradient as the reference value. And the gradient checking is performed in the following three steps:
 
-- 1. calculate the numeric gradient.
-- 2. calculate CPU kernel Gradient with the backward Operator and compare it with the numeric gradient.
-- 3. calculate GPU kernel Gradient with the backward Operator and compare it with the numeric gradient.(if support GPU)
+1. calculate the numerical gradient
+2. calculate CPU kernel gradient with the backward Operator and compare it with the numerical gradient
+3. calculate GPU kernel gradient with the backward Operator and compare it with the numeric gradient (if supported)
 
 #### Python Interface
 
@@ -110,8 +110,8 @@ Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as
    :param forward_op: used to create backward_op
    :param input_vars: numpy value of input variable. The following
        computation will use these variables.
-    :param inputs_to_check: inputs var names that should check gradient.
-    :param output_name: output name that used to
+    :param inputs_to_check: the input variable with respect to which to compute the gradient.
+    :param output_name: The final output variable name.
    :param max_relative_error: The relative tolerance parameter.
    :param no_grad_set: used when create backward ops
    :param only_cpu: only compute and check gradient on cpu kernel.
@@ -120,24 +120,24 @@ Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as
 ```
 
 ### How to check if two numpy array is close enough?
-if `abs_numeric_grad` is nearly zero, then use abs error for numeric_grad, not relative
+if `abs_numerical_grad` is nearly zero, then use abs error for numerical_grad
 
 ```python
-numeric_grad = ...
+numerical_grad = ...
 operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
 
-abs_numeric_grad = numpy.abs(numeric_grad)
-# if abs_numeric_grad is nearly zero, then use abs error for numeric_grad, not relative
+abs_numerical_grad = numpy.abs(numerical_grad)
+# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
 # error.
-abs_numeric_grad[abs_numeric_grad < 1e-3] = 1
+abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
 
-diff_mat = numpy.abs(abs_numeric_grad - operator_grad) / abs_numeric_grad
+diff_mat = numpy.abs(abs_numerical_grad - operator_grad) / abs_numerical_grad
 max_diff = numpy.max(diff_mat)
 ```
 
 
 #### Notes:
-1,The Input data for auto gradient checker should be reasonable to avoid numeric problem.
+The Input data for auto gradient checker should be reasonable to avoid numerical stability problem.
 
 
 #### Refs:
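
For readers who want to try the technique outside PaddlePaddle, here is a minimal, self-contained NumPy sketch of central-difference gradient checking. It is an independent illustration of the method described in the document above, not the repository's `get_numerical_gradient`; the relative-error check at the end mirrors the snippet quoted in the diff.

```python
import numpy as np

def numerical_gradient(f, x, delta=1e-5):
    """Central difference: df/dx_i ~= (f(x + d*e_i) - f(x - d*e_i)) / (2*d)."""
    grad = np.zeros_like(x)
    flat_x, flat_g = x.ravel(), grad.ravel()  # views (x is assumed C-contiguous)
    for i in range(flat_x.size):
        origin = flat_x[i]
        flat_x[i] = origin + delta
        y_pos = f(x)                  # scalar output with x_i perturbed up
        flat_x[i] = origin - delta
        y_neg = f(x)                  # scalar output with x_i perturbed down
        flat_x[i] = origin            # restore the original value
        flat_g[i] = (y_pos - y_neg) / (2.0 * delta)
    return grad

# Example: f(x) = sum(x**2) has the analytic gradient 2*x.
x = np.random.rand(3, 4)
num_grad = numerical_gradient(lambda v: np.sum(v ** 2), x)
analytic_grad = 2.0 * x

# Relative error, falling back to absolute error near zero,
# in the spirit of the comparison shown in the diff above.
abs_num = np.abs(num_grad)
abs_num[abs_num < 1e-3] = 1.0
max_diff = np.max(np.abs(num_grad - analytic_grad) / abs_num)
assert max_diff < 1e-4
```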

doc/design/functions_operators_layers.md

Lines changed: 2 additions & 2 deletions
@@ -53,12 +53,12 @@ Let's explain using an example. Suppose that we are going to compose the FC usi
 ```python
 def operator.mul(X1, X2):
     O = Var()
-    paddle.cpp.create_operator("mul", input={X1, Y1], output=O)
+    paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
     return O
 
 def operator.add(X1, X2):
     O = Var()
-    paddle.cpp.create_operator("add", input={X1, X2], output=O)
+    paddle.cpp.create_operator("add", input={X1, X2}, output=O)
     return O
 ```
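
For context, the point of the two wrappers in the snippet above is that higher-level layers can then be written as plain compositions of them. Below is a minimal, runnable NumPy analogue of that composition pattern — an illustration only, not the pseudo code of the document being patched.

```python
import numpy as np

# Stand-ins for the "mul" and "add" operators created via paddle.cpp.create_operator.
def op_mul(x, w):
    return np.matmul(x, w)

def op_add(x, b):
    return x + b

# A fully connected layer is just add(mul(X, W), b).
def layer_fc(x, w, b):
    return op_add(op_mul(x, w), b)

x = np.random.rand(2, 3)
w = np.random.rand(3, 4)
b = np.random.rand(4)
assert layer_fc(x, w, b).shape == (2, 4)
```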

doc/design/graph.md

Lines changed: 2 additions & 2 deletions
@@ -56,7 +56,7 @@ For each parameter, like W and b created by `layer.fc`, marked as double circles
 
 ## Block and Graph
 
-The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block[(https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
+The word block and graph are interchangable in the desgin of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphore of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
 
 A Block keeps operators in an array `BlockDesc::ops`
 
@@ -67,4 +67,4 @@ message BlockDesc {
 }
 ```
 
-in the order that there appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
+in the order that they appear in user programs, like the Python program at the beginning of this article. We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.

doc/design/parameters_in_cpp.md

Lines changed: 6 additions & 6 deletions
@@ -1,19 +1,19 @@
 # Design Doc: The C++ Class `Parameters`
 
-`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
+`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
 
-We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation:
+We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation:
 * We just use `memcpy` to share Parameters between topologies, but this is very inefficient.
-* We did not implement share Parameters while training. We just trigger `memcpy` when start training.
+* We did not support sharing Parameters while training. We just trigger `memcpy` when start training.
 
-It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`:
+It is necessary that we implement Parameters in CPP side. However, it could result a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current PaddlePaddle implementation, there are three concepts associated with `Parameters`:
 
 1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
 It is evident that we should use `paddle::Parameter` when developing `Parameters`.
 However, the `Parameter` class contains many functions and does not have a clear interface.
 It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
 When we developing `Parameters`, we only use `create/store Parameter` functionality.
-We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation.
+We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation.
 
 2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
 We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
@@ -24,7 +24,7 @@ Also, we should handle multi-GPU/CPU training, because `forward` and `backward`
 So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
 
 
-The step by step approach for implementation Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one.
+The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one.
 
 1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
 
