78 changes: 78 additions & 0 deletions paddle/operators/squared_l2_norm_op.cc
@@ -0,0 +1,78 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/squared_l2_norm_op.h"

namespace paddle {
namespace operators {

using framework::Tensor;

class SquaredL2NormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");

ctx->SetOutputDim("Out", {1});
}
};

class SquaredL2NormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null.");

ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};

class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SquaredL2NormOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor) The input of squared_l2_norm op.");
AddOutput("Out", "(Float) The output of squared_l2_norm op.");
AddComment(R"DOC(
SquaredL2Norm Operator.

Computes the squared L2 norm of a tensor.

Out = sum (X ** 2)

)DOC");
}
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
squared_l2_norm_grad, ops::SquaredL2NormGradOp);
REGISTER_OP_CPU_KERNEL(
squared_l2_norm,
ops::SquaredL2NormKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
squared_l2_norm_grad,
ops::SquaredL2NormGradKernel<paddle::platform::CPUPlace, float>);
24 changes: 24 additions & 0 deletions paddle/operators/squared_l2_norm_op.cu
@@ -0,0 +1,24 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define EIGEN_USE_GPU
#include "paddle/operators/squared_l2_norm_op.h"

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
squared_l2_norm,
ops::SquaredL2NormKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
squared_l2_norm_grad,
ops::SquaredL2NormGradKernel<paddle::platform::GPUPlace, float>);
64 changes: 64 additions & 0 deletions paddle/operators/squared_l2_norm_op.h
@@ -0,0 +1,64 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

// Out = sum(square(X))
template <typename Place, typename T>
class SquaredL2NormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X");
framework::Tensor *Out = context.Output<framework::Tensor>("Out");
Out->mutable_data<T>(context.GetPlace());

auto x = framework::EigenVector<T>::Flatten(*X);
auto out = framework::EigenVector<T>::Flatten(*Out);
auto place = context.GetEigenDevice<Place>();

out.device(place) = x.square().sum();
qingqing01 (Contributor) commented on Oct 24, 2017:
  1. SquaredL2Norm is a good and clear name: the square of the L2 norm.

  2. But for L2 regularization, the overall cost usually does not contain the regularization term (also called a weight decay term) in most frameworks, including the old PaddlePaddle framework, so there is no need to use this op in the forward pass, even though the cost function contains this term.

    See the formula in this link:
    http://ufldl.stanford.edu/wiki/index.php/Backpropagation_Algorithm

    In backpropagation, the derivative of the overall cost function J(W, b) (see the formula at the same link) shows that the weight decay term is a linear operation on W (discussed with @lcy-seso several days ago). So only a scale op is needed for L2 regularization in the parameter-updating process. The momentum update formula in the paper likewise applies weight decay as a linear operation on W.

So I'm not sure whether this op is needed, or whether there are other use scenarios.

reyoung (Collaborator) commented on Oct 24, 2017:

  1. The regularization loss is a term in the overall loss equation. If we want to plot the overall loss at every iteration for a classification task, the loss should equal classification loss + regularization loss.

  2. Weight decay only fits L2-norm regularization. We are trying to implement a common approach to regularization that fits both L1 and L2.

  3. We will provide weight decay in future PRs. There is no conflict in providing both weight decay and an L2 norm operator.

The PR author (Contributor) replied:

@qingqing01 @lcy-seso Both of these regularization techniques have their advantages and disadvantages. Let me summarize them as follows:

| Weight decay in the optimizer | Separate operators for regularization |
| --- | --- |
| No forward-prop op, hence faster | There will be a forward-prop op |
| Will not support making plots of total loss vs. epoch/iteration | Will support those plots |
| Does not generalize well beyond L2 and L1 regularization; use of batch norm and layer norm might invalidate this approach | A very general approach that can be applied to any kind of network |
| Not easy for researchers to add new regularizers, because regularization is tightly coupled with the optimizers; they might have to change all optimizers | Adding new regularization schemes is easy, as the regularization code is independent of the optimization code |
| Frameworks that support this: PyTorch, Caffe | Frameworks that support this: TensorFlow, Theano, Lasagne |

The PR author (Contributor) added:

I think the forward-prop op might be necessary because, while training a model, it is very common practice to check convergence by plotting the total loss function vs. time. Also, during inference an intelligent executor can easily prune the graph to remove the regularization nodes.

I also agree with @reyoung that we can implement weight decay separately; that can cover just the case of the L2 and L1 penalty losses.

Please let me know what you think about this plan.

qingqing01 (Contributor) commented on Oct 25, 2017:

  • For the plotting:
    In the old PaddlePaddle framework and in Caffe, I think we usually plot the loss without the regularization term vs. time.

  • For the regularization:
    I fully agree with separating regularization into its own operators rather than implementing it in the optimizer. But I think whether to add the regularization loss to the overall loss should be up to the user, not the default, since it adds more computation during training. The default regularization would only use L2LrRegularizer (a linear operator) / L1LrRegularizer operators for the parameter update, like the implementation in https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Regularizer.h.

  • About this PR:
    I agree to merge this op. But whether to use it in the forward pass and add the regularization loss to the overall loss should be decided by the user, not by the framework by default, when regularization is used in training.

The PR author (Contributor) replied:

@qingqing01 @lcy-seso Thank you for going through the PR. I discussed this today with @reyoung, and we feel that your point is valid. We can add these ops, but whether to use them in the forward pass and add them to the loss will be the user's choice. In another PR, I will add separate operators for regularization that are used only in the backward pass and are not implemented in the optimizer.
In the case of L2, this will be a simple scale op; in the case of L1 regularization, it will be a combination of scale and sign ops.
Thank you so much for your feedback on this.
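
For concreteness, the regularization gradients being discussed reduce to a scale of W for L2 and a scaled sign of W for L1. A minimal NumPy sketch of that backward-only scheme (the names `l2_reg_grad`, `l1_reg_grad`, and `reg_coeff` are illustrative, not part of this PR):

```python
import numpy as np

def l2_reg_grad(w, reg_coeff):
    # Gradient of reg_coeff * ||w||^2 w.r.t. w: a pure scale of w.
    return 2.0 * reg_coeff * w

def l1_reg_grad(w, reg_coeff):
    # (Sub)gradient of reg_coeff * ||w||_1 w.r.t. w: a scaled sign of w.
    return reg_coeff * np.sign(w)

# Applied only in the backward pass: add the regularization gradient to the
# data-loss gradient before the parameter update.
w = np.random.uniform(-1, 1, (3, 4)).astype("float32")
data_grad = np.zeros_like(w)  # stand-in for the gradient from the data loss
total_grad = data_grad + l2_reg_grad(w, reg_coeff=1e-4)
```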

}
};

// dX = 2 * dOut * X
template <typename Place, typename T>
class SquaredL2NormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X");
const framework::Tensor *dOut =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE(dOut->numel() == 1,
"Squared L2 Norm Gradient should be scalar");
framework::Tensor *dX =
context.Output<framework::Tensor>(framework::GradVarName("X"));
dX->mutable_data<T>(context.GetPlace());

auto x = framework::EigenVector<T>::Flatten(*X);
auto dout = framework::EigenVector<T>::Flatten(*dOut);
auto dx = framework::EigenVector<T>::Flatten(*dX);
auto place = context.GetEigenDevice<Place>();

Eigen::DSizes<int, 1> x_dsize(X->numel());
dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast<T>(2.0);
}
};

} // namespace operators
} // namespace paddle
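
For reference, a plain NumPy sketch of what the two kernels in this header compute (function names here are illustrative only):

```python
import numpy as np

def squared_l2_norm_forward(x):
    # Mirrors SquaredL2NormKernel: Out = sum(x ** 2), a single scalar.
    return np.sum(np.square(x))

def squared_l2_norm_backward(x, d_out):
    # Mirrors SquaredL2NormGradKernel: dX = 2 * dOut * X, with the scalar
    # upstream gradient d_out broadcast over every element of x.
    return 2.0 * d_out * x

x = np.random.uniform(-1, 1, (13, 19)).astype("float32")
out = squared_l2_norm_forward(x)
dx = squared_l2_norm_backward(x, d_out=1.0)
```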
29 changes: 29 additions & 0 deletions python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
@@ -0,0 +1,29 @@
import numpy as np
import unittest
from numpy import linalg as LA
from op_test import OpTest


class TestL2LossOp(OpTest):
"""Test squared_l2_norm
"""

def setUp(self):
self.op_type = "squared_l2_norm"
self.max_relative_error = 0.05

X = np.random.uniform(-1, 1, (13, 19)).astype("float32")
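        # Nudge near-zero entries away from zero so the relative-error
        # gradient check is not dominated by tiny gradient values.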
X[np.abs(X) < self.max_relative_error] = 0.1
self.inputs = {'X': X}
self.outputs = {'Out': np.square(LA.norm(X))}

def test_check_output(self):
self.check_output()

def test_check_grad(self):
self.check_grad(
['X'], 'Out', max_relative_error=self.max_relative_error)


if __name__ == "__main__":
unittest.main()
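
As an extra sanity check outside the op_test harness, the analytic gradient dX = 2 * X can be compared against a central-difference estimate, which is essentially what check_grad does numerically (a standalone sketch):

```python
import numpy as np

x = np.random.uniform(-1, 1, (4, 5)).astype("float64")
analytic = 2.0 * x  # gradient of Out = sum(x ** 2) w.r.t. x, upstream grad = 1

eps = 1e-6
numeric = np.zeros_like(x)
for i in range(x.size):
    x_plus, x_minus = x.copy(), x.copy()
    x_plus.flat[i] += eps
    x_minus.flat[i] -= eps
    numeric.flat[i] = (np.sum(x_plus ** 2) - np.sum(x_minus ** 2)) / (2 * eps)

assert np.allclose(analytic, numeric, rtol=1e-5, atol=1e-7)
```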