Adding the squared L2 norm operator for L2 regularization #5030
Merged
Commits (5):
- e45d5b0: Adding the L2 loss operator for L2 regularization
- 88a09e0: Renaming l2_loss op to squared_l2_norm_op
- a5958bc: Merge remote-tracking branch 'origin/develop' into l2_loss
- df79d30: Merge remote-tracking branch 'origin/develop' into l2_loss
- 1ff3d8d: Addressing code review feedback
Operator definition and CPU kernel registration (new file, 78 lines):

```cpp
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/squared_l2_norm_op.h"

namespace paddle {
namespace operators {

using framework::Tensor;

class SquaredL2NormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");

    ctx->SetOutputDim("Out", {1});
  }
};

class SquaredL2NormGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@GRAD) should be not null.");

    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
};

class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SquaredL2NormOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "(Tensor) The input of squared_l2_norm op.");
    AddOutput("Out", "(Float) The output of squared_l2_norm op.");
    AddComment(R"DOC(
SquaredL2Norm Operator.

Computes the squared L2 norm of a tensor.

Out = sum (X ** 2)

)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
            squared_l2_norm_grad, ops::SquaredL2NormGradOp);
REGISTER_OP_CPU_KERNEL(
    squared_l2_norm,
    ops::SquaredL2NormKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
    squared_l2_norm_grad,
    ops::SquaredL2NormGradKernel<paddle::platform::CPUPlace, float>);
```
GPU kernel registration (new file, 24 lines):

```cpp
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define EIGEN_USE_GPU
#include "paddle/operators/squared_l2_norm_op.h"

namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
    squared_l2_norm,
    ops::SquaredL2NormKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
    squared_l2_norm_grad,
    ops::SquaredL2NormGradKernel<paddle::platform::GPUPlace, float>);
```
Kernel implementations in paddle/operators/squared_l2_norm_op.h (new file, 64 lines):

```cpp
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

// Out = sum(square(X))
template <typename Place, typename T>
class SquaredL2NormKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const framework::Tensor *X = context.Input<framework::Tensor>("X");
    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
    Out->mutable_data<T>(context.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*X);
    auto out = framework::EigenVector<T>::Flatten(*Out);
    auto place = context.GetEigenDevice<Place>();

    out.device(place) = x.square().sum();
  }
};

// dX = 2 * dOut * X
template <typename Place, typename T>
class SquaredL2NormGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const framework::Tensor *X = context.Input<framework::Tensor>("X");
    const framework::Tensor *dOut =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    PADDLE_ENFORCE(dOut->numel() == 1,
                   "Squared L2 Norm Gradient should be scalar");
    framework::Tensor *dX =
        context.Output<framework::Tensor>(framework::GradVarName("X"));
    dX->mutable_data<T>(context.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*X);
    auto dout = framework::EigenVector<T>::Flatten(*dOut);
    auto dx = framework::EigenVector<T>::Flatten(*dX);
    auto place = context.GetEigenDevice<Place>();

    Eigen::DSizes<int, 1> x_dsize(X->numel());
    dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast<T>(2.0);
  }
};

}  // namespace operators
}  // namespace paddle
```
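For reference, the computation the two kernels implement can be written out in plain NumPy. This is only an illustrative sketch, not part of the PR; the function names are made up for the example.

```python
import numpy as np

def squared_l2_norm_forward(x):
    # Out = sum(X ** 2): a single scalar, as in SquaredL2NormKernel.
    return np.sum(np.square(x))

def squared_l2_norm_backward(x, d_out):
    # dX = 2 * dOut * X, as in SquaredL2NormGradKernel; d_out is the scalar
    # gradient of the final loss with respect to Out.
    return 2.0 * d_out * x

x = np.random.uniform(-1, 1, (13, 19)).astype("float32")
out = squared_l2_norm_forward(x)             # equals the squared Frobenius norm of x
dx = squared_l2_norm_backward(x, d_out=1.0)  # elementwise 2 * x
assert np.allclose(out, np.square(np.linalg.norm(x)), rtol=1e-4)
```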
python/paddle/v2/framework/tests/test_squared_l2_norm_op.py (new file, 29 additions):
```python
import numpy as np
import unittest
from numpy import linalg as LA
from op_test import OpTest


class TestL2LossOp(OpTest):
    """Test squared_l2_norm
    """

    def setUp(self):
        self.op_type = "squared_l2_norm"
        self.max_relative_error = 0.05

        X = np.random.uniform(-1, 1, (13, 19)).astype("float32")
        X[np.abs(X) < self.max_relative_error] = 0.1
        self.inputs = {'X': X}
        self.outputs = {'Out': np.square(LA.norm(X))}

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(
            ['X'], 'Out', max_relative_error=self.max_relative_error)


if __name__ == "__main__":
    unittest.main()
```
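As background on what `check_grad` does conceptually here, the analytic gradient 2·X is compared against a finite-difference estimate within `max_relative_error`. The snippet below is a rough, self-contained illustration of that idea, not the actual `op_test` implementation.

```python
import numpy as np

def numeric_grad(f, x, eps=1e-3):
    # Central finite differences of a scalar-valued function f at x.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + eps
        f_plus = f(x)
        x[idx] = orig - eps
        f_minus = f(x)
        x[idx] = orig
        grad[idx] = (f_plus - f_minus) / (2 * eps)
        it.iternext()
    return grad

X = np.random.uniform(-1, 1, (4, 5)).astype("float64")
analytic = 2.0 * X                                       # gradient of sum(X ** 2)
numeric = numeric_grad(lambda v: np.sum(np.square(v)), X)
assert np.max(np.abs(analytic - numeric)) <= 0.05 * np.max(np.abs(analytic))
```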
`SquaredL2Norm` is a good and clear name: the square of the L2 norm. But for L2 regularization, the overall cost usually does not contain the regularization term (also called the weight decay term) in most frameworks, including the old PaddlePaddle framework, so there is no need to run this op in the forward pass, even though the cost function in the formula does contain this term.
See the formula at this link: http://ufldl.stanford.edu/wiki/index.php/Backpropagation_Algorithm
In the backward pass, the derivative of the overall cost function J(W, b) (see the formula at the same link) makes the weight decay term a linear operation on W (discussed with @lcy-seso several days ago), so only a scale op is needed for L2 regularization in the parameter-updating step. The momentum update formula in the paper also treats weight decay as a linear operation on W.
So I'm not sure whether this op is needed, or whether there are other use scenarios.
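To make the "scale op" point concrete, here is a small NumPy sketch (illustrative only; the learning rate and λ values are arbitrary) showing that applying the gradient of the L2 term in the update is the same as linearly scaling W, i.e. classic weight decay.

```python
import numpy as np

lr, lam = 0.1, 1e-4                    # illustrative learning rate and L2 coefficient
W = np.random.randn(3, 4)
grad = np.random.randn(3, 4)           # gradient of the data loss w.r.t. W

# Explicit regularization: d/dW (lam/2 * ||W||^2) = lam * W
W_explicit = W - lr * (grad + lam * W)

# Equivalent weight-decay form: linearly scale W, then apply the data gradient
W_decay = (1.0 - lr * lam) * W - lr * grad

assert np.allclose(W_explicit, W_decay)
```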
Regularization loss is one term of the overall loss equation. If we want to plot the overall loss at every iteration for a classification task, the loss should equal classification loss + regularization loss. `weight decay` only fits L2-norm regularization; we are trying to implement a common way to do regularization that fits both L1 and L2. We will provide `weight decay` in future PRs; there is no conflict in providing both `weight decay` and the `L2 Norm` operator.
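For the plotting use case mentioned above, the forward op simply contributes one more scalar to the curve. A minimal sketch (the λ value and loss numbers are illustrative, not framework API):

```python
import numpy as np

lam = 1e-4
W = np.random.randn(128, 10)

classification_loss = 0.7                                # e.g. cross-entropy on a batch
regularization_loss = lam * np.sum(np.square(W))         # lam * squared_l2_norm(W)
total_loss = classification_loss + regularization_loss   # the value to plot per iteration
```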
@qingqing01 @lcy-seso Both these regularization techniques have their advantages and disadvantages. Let me summarize them as follows:
I think the forward-pass op might be necessary because, while training a model, it is very common practice to check convergence by plotting the total loss function vs. time. Also, during inference an intelligent executor can easily prune the graph to remove the regularization nodes.
I also agree with @reyoung that we can implement weight decay separately; that would apply only to the L2 and L1 penalty losses.
Please let me know what you think about this plan.
For the plotting
In the old PaddlePaddle framework and in Caffe, I think we usually plot the loss without the regularization term vs. time.
For the regularization
I agree with having separate operators for regularization rather than implementing it in the optimizer. But whether to add the regularization loss to the overall loss should be up to the users, not the default, since it adds extra computation during training. The default regularization should only use the `L2LrRegularizer` (a linear operator) / `L1LrRegularizer` operators for parameter updating, like the implementation in https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Regularizer.h.
About this PR
I agree to merge this op. But if regularization is used during training, whether to run it in the forward pass and add the regularization loss to the overall loss should be up to the users, not the framework default.
@qingqing01 @lcy-seso Thank you for going through the PR. I discussed this today with @reyoung and we feel that your point is valid: we can add these ops, but whether to use them in the forward pass and add them to the loss will be the user's choice. In another PR, I will add separate operators for regularization that will be used only in the backward pass and not implemented in the optimizer.
In the case of L2, this will be a simple scale op; in the case of L1 regularization, it will be a combination of the scale and sign ops.
Thank you so much for your feedback on this.
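As a rough sketch of the follow-up described above (purely illustrative; the follow-up PR itself is not shown here): the backward-only L2 penalty contributes a scaled copy of W to the gradient, while the L1 penalty combines a sign op with a scale.

```python
import numpy as np

lam = 1e-3
W = np.random.randn(5, 5)
data_grad = np.random.randn(5, 5)      # gradient of the data loss w.r.t. W

# L2 penalty: d/dW (lam/2 * ||W||^2) = lam * W        -> a single scale op
grad_with_l2 = data_grad + lam * W

# L1 penalty: d/dW (lam * sum(|W|)) = lam * sign(W)   -> sign op followed by scale
grad_with_l1 = data_grad + lam * np.sign(W)
```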