Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions paddle/capi/Matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
}

PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
paddle_real* value) {
paddle_real* value) {
if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat);
if (ptr->mat == nullptr) return kPD_NULLPTR;
Expand All @@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
}

PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
paddle_real* result) {
paddle_real* result) {
if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat);
if (ptr->mat == nullptr) return kPD_NULLPTR;
Expand Down
8 changes: 4 additions & 4 deletions paddle/capi/matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
* @note value should contain enough element of data to init the mat
*/
PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
paddle_real* value);
paddle_real* value);

/**
* @brief PDMatGetRow Get raw row buffer from matrix
Expand All @@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
paddle_real** rawRowBuffer);

/**
* @brief copy data from the matrix
* @brief copy data from the matrix
* @param [in] mat Target matrix
* @param [out] result pointer to store the matrix data
* @param [out] result pointer to store the matrix data
* @return paddle_error
* @note the space of the result should allocated before invoke this API
*/
PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
paddle_real* result);
paddle_real* result);
/**
* @brief PDMatCreateNone Create None Matrix
* @return
Expand Down
9 changes: 4 additions & 5 deletions paddle/framework/tensor_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -135,18 +135,17 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
auto dst_ptr = static_cast<void*>(dst->data());

if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
src_ptr, size);
memory::Copy(dst_place, dst_ptr,
boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()), src_ptr,
size,
dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif

}

} // namespace framework
Expand Down
31 changes: 13 additions & 18 deletions paddle/operators/math/maxouting.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ template <typename T>
class MaxOutFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
framework::Tensor * output,
const framework::Tensor& input, framework::Tensor* output,
int groups) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
Expand All @@ -37,34 +36,30 @@ class MaxOutFunctor<platform::CPUPlace, T> {
T* output_data = output->mutable_data<T>(context.GetPlace());

for (int i = 0; i < batch_size; ++i) {
int new_bindex = c_size * i;
int new_bindex = c_size * i;
for (int c = 0; c < output_channels; ++c) {
int new_cindex = fea_size * c;
for (int f = 0; f < fea_size; ++f) {
T ele = static_cast<T>(-FLT_MAX);
for (int ph = 0; ph < groups; ++ph) {
T x = input_data[(new_bindex + new_cindex) * groups
+ ph * fea_size + f];
T x = input_data[(new_bindex + new_cindex) * groups +
ph * fea_size + f];
ele = ele > x ? ele : x;
}
output_data[(new_bindex+new_cindex+f)] = ele;
output_data[(new_bindex + new_cindex + f)] = ele;
}
}
}
}
};



template <class T>
class MaxOutGradFunctor<platform::CPUPlace, T> {
public:
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
framework::Tensor * input_grad,
const framework::Tensor& input, framework::Tensor* input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad,
int groups) {
const framework::Tensor& output_grad, int groups) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
Expand All @@ -84,11 +79,11 @@ class MaxOutGradFunctor<platform::CPUPlace, T> {
bool continue_match = true;
int output_idx = blen + clen + f;
for (int g = 0; g < groups && continue_match; ++g) {
int input_idx = input_idx0 + fea_size * g;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] += output_grad_data[output_idx];
continue_match = false;
}
int input_idx = input_idx0 + fea_size * g;
if (input_data[input_idx] == output_data[output_idx]) {
input_grad_data[input_idx] += output_grad_data[output_idx];
continue_match = false;
}
}
}
}
Expand Down
80 changes: 39 additions & 41 deletions paddle/operators/math/maxouting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ namespace math {

template <typename T>
__global__ void KernelMaxOut(const int nthreads, const T* input_data,
const int channels,
const int input_height, const int input_width,
int groups, T* output_data ) {
const int channels, const int input_height,
const int input_width, int groups,
T* output_data) {
const int size = input_height * input_width * channels / groups;
const int feat_len = input_height * input_width;
int index = blockIdx.x * blockDim.x + threadIdx.x;
Expand All @@ -34,7 +34,7 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data,
int channel_idx = batch_offset / feat_len;
int feat_idx = batch_offset % feat_len;
int data_idx =
(batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
(batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
T ele = static_cast<T>(-FLT_MAX);
for (int g = 0; g < groups; ++g) {
T x = input_data[data_idx + g * feat_len];
Expand All @@ -44,34 +44,35 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data,
}
}
template <typename T>
__global__ void KernelMaxoutGrad(
const int nthreads, const T* input_data, const T* output_data,
const T* output_grad, T* input_grad, const int channels,
const int input_height, const int input_width, int groups) {
const int size = input_height * input_width * channels / groups;
const int feat_len = input_height * input_width;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int batch_idx = i / size;
int batch_offset = i % size;
int channel_idx = batch_offset / feat_len;
int feat_idx = batch_offset % feat_len;
int data_idx =
__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
const T* output_data, const T* output_grad,
T* input_grad, const int channels,
const int input_height, const int input_width,
int groups) {
const int size = input_height * input_width * channels / groups;
const int feat_len = input_height * input_width;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int offset = blockDim.x * gridDim.x;
for (int i = index; i < nthreads; i += offset) {
int batch_idx = i / size;
int batch_offset = i % size;
int channel_idx = batch_offset / feat_len;
int feat_idx = batch_offset % feat_len;
int data_idx =
(batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
int max_index = -1;
bool continue_match = true;
for (int g = 0; g < groups && continue_match; ++g) {
if (input_data[data_idx + g * feat_len] == output_data[i]) {
max_index = data_idx + g * feat_len;
continue_match = false;
break;
}
}
if (max_index != -1) {
input_grad[max_index] += output_grad[index];
int max_index = -1;
bool continue_match = true;
for (int g = 0; g < groups && continue_match; ++g) {
if (input_data[data_idx + g * feat_len] == output_data[i]) {
max_index = data_idx + g * feat_len;
continue_match = false;
break;
}
}
if (max_index != -1) {
input_grad[max_index] += output_grad[index];
}
}
}
/*
* All tensors are in NCHW format.
Expand All @@ -80,7 +81,7 @@ template <typename T>
class MaxOutFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor * output,
const framework::Tensor& input, framework::Tensor* output,
int groups) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
Expand All @@ -92,7 +93,7 @@ class MaxOutFunctor<platform::GPUPlace, T> {

const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(context.GetPlace());
int nthreads = output->numel();
int nthreads = output->numel();
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);
Expand All @@ -101,8 +102,7 @@ class MaxOutFunctor<platform::GPUPlace, T> {
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(nthreads, input_data, input_channels,
input_height, input_width, groups,
output_data);
input_height, input_width, groups, output_data);
}
};
/*
Expand All @@ -112,11 +112,9 @@ template <typename T>
class MaxOutGradFunctor<platform::GPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
framework::Tensor * input_grad,
const framework::Tensor& input, framework::Tensor* input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad,
int groups) {
const framework::Tensor& output_grad, int groups) {
const int batch_size = input.dims()[0];
const int input_channels = input.dims()[1];
const int input_height = input.dims()[2];
Expand All @@ -129,17 +127,17 @@ class MaxOutGradFunctor<platform::GPUPlace, T> {
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
int nthreads = output.numel();
int nthreads = output.numel();
int blocks = (nthreads + 1024 - 1) / 1024;
dim3 threads(1024, 1);
dim3 grid(blocks, 1);

KernelMaxoutGrad<
T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
nthreads, input_data, output_data, output_grad_data, input_grad_data,
input_channels, input_height, input_width, groups);
.stream()>>>(nthreads, input_data, output_data,
output_grad_data, input_grad_data, input_channels,
input_height, input_width, groups);
}
};

Expand Down
8 changes: 3 additions & 5 deletions paddle/operators/math/maxouting.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,22 @@ namespace paddle {
namespace operators {
namespace math {

#define FLT_MAX \
__FLT_MAX__
#define FLT_MAX __FLT_MAX__

template <typename Place, typename T>

class MaxOutFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor * output,
const framework::Tensor& input, framework::Tensor* output,
int groups);
};

template <typename Place, class T>
class MaxOutGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
framework::Tensor * input_grad,
const framework::Tensor& input, framework::Tensor* input_grad,
const framework::Tensor& output,
const framework::Tensor& output_grad, int groups);
};
Expand Down
38 changes: 18 additions & 20 deletions paddle/operators/maxout_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,17 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
public:
MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
AddInput(
"X",
"(Tensor) The input tensor of maxout operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of channels, H and W is the height and width of feature.");
AddOutput("Out",
"(Tensor) The output tensor of maxout operator."
"The format of output tensor is also NCHW."
"Where N is batch size, C is "
"the number of channels, H and W is the height and "
"width of feature.");
"(Tensor) The output tensor of maxout operator."
"The format of output tensor is also NCHW."
"Where N is batch size, C is "
"the number of channels, H and W is the height and "
"width of feature.");
AddAttr<int>(
"groups",
R"DOC("Specifies how many groups the input tensor will be split"
Expand Down Expand Up @@ -59,21 +60,19 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
}
};


class MaxOutOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MaxoutOp"
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of MaxoutOp"
"should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MaxoutOp should not be null.");
auto in_x_dims = ctx->GetInputDim("X");
int groups = ctx->Attrs().Get<int>("groups");
// check groups > 1
PADDLE_ENFORCE_GT(
groups, 1,
"groups should be larger than 1 in maxoutop");
PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop");
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
output_shape.push_back(in_x_dims[2]);
output_shape.push_back(in_x_dims[3]);
Expand All @@ -87,18 +86,17 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Input(X@GRAD) should not be null.");
"Input(X@GRAD) should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
} // namespace operators
} // namespace paddle
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
ops::MaxOutOpGrad);
REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel<paddle::platform::CPUPlace,
float>);
REGISTER_OP_CPU_KERNEL(maxout_grad,
ops::MaxOutGradKernel<paddle::platform::CPUPlace,
float>);
ops::MaxOutOpGrad);
REGISTER_OP_CPU_KERNEL(maxout,
ops::MaxOutKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
maxout_grad, ops::MaxOutGradKernel<paddle::platform::CPUPlace, float>);
8 changes: 3 additions & 5 deletions paddle/operators/maxout_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(maxout,
ops::MaxOutKernel<paddle::platform::GPUPlace, float>,
ops::MaxOutKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(maxout_grad,
ops::MaxOutGradKernel<paddle::platform::GPUPlace,
float>,
ops::MaxOutGradKernel<paddle::platform::GPUPlace,
double>);
REGISTER_OP_GPU_KERNEL(
maxout_grad, ops::MaxOutGradKernel<paddle::platform::GPUPlace, float>,
ops::MaxOutGradKernel<paddle::platform::GPUPlace, double>);
Loading