From 5dabf9c484c0bc5410e3700e3010cdabb4bf903c Mon Sep 17 00:00:00 2001
From: Xiaohan Wei
Date: Tue, 15 Feb 2022 11:08:17 -0800
Subject: [PATCH] [caffe2] allow dropout to take 1.0 as dropout ratio to
 zero-out a layer (#72741)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72741

As titled.

Context: This is useful for quickly mitigating feature-induced overfitting:
we can do omni-transfer on a trained model and apply dropout with ratio = 1
to the features that cause the overfitting. Directly removing the features
would not be feasible in omni-transfer scenarios, since the downstream FC
sizes would change.

Experimental records: https://fb.quip.com/npIkAgRc8jl9#temp:C:DWC050ceaba14424d23a78462c01

Applying dropout = 1 to the selected features improves the eval NE over the
next few hours (compared to the v0 baseline), as shown in the figures.

Test Plan:
```
buck test caffe2/caffe2/python/operator_test:dropout_op_test
```

Reviewed By: ustctf

Differential Revision: D34178732

fbshipit-source-id: 533feebe21bc582eefd756de397d5c7807c7438d
---
 caffe2/operators/dropout_op.cc                |  5 ++-
 caffe2/operators/dropout_op.h                 |  2 --
 .../python/operator_test/dropout_op_test.py  | 32 +++++++++++++++++++
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc
index 6f37407bd40eb..bbd1eb1c72c9e 100644
--- a/caffe2/operators/dropout_op.cc
+++ b/caffe2/operators/dropout_op.cc
@@ -15,13 +15,12 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
     return true;
   } else {
     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-    float scale = 1. / (1. - ratio_);
+    float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
     // mask=true means keep, and mask=false means not keep, so we will
     // generate probability depending on 1-ratio.
     at::bernoulli_distribution<double> dist(1. - ratio_);
     const float* Xdata = X.data<float>();
     float* Ydata = Y->template mutable_data<float>();
-
     auto mask = Output(1, X.sizes(), at::dtype<bool>());
     bool* mask_data = mask->template mutable_data<bool>();
     auto* gen = context_.RandGenerator();
@@ -52,7 +51,7 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
   const bool* mask_data = mask.data<bool>();
   float* dXdata = dX->template mutable_data<float>();
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-  float scale = 1. / (1. - ratio_);
+  float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
   for (int i = 0; i < dY.numel(); ++i) {
     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
     dXdata[i] = dYdata[i] * mask_data[i] * scale;
diff --git a/caffe2/operators/dropout_op.h b/caffe2/operators/dropout_op.h
index aff0528c7ffae..ae8f0ff1bba66 100644
--- a/caffe2/operators/dropout_op.h
+++ b/caffe2/operators/dropout_op.h
@@ -19,7 +19,6 @@ class DropoutOp final : public Operator<Context> {
         is_test_(
             this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
     CAFFE_ENFORCE_GE(ratio_, 0);
-    CAFFE_ENFORCE_LT(ratio_, 1);
   }
 
   bool RunOnDevice() override;
@@ -41,7 +40,6 @@ class DropoutGradientOp final : public Operator<Context> {
         is_test_(
             this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
     CAFFE_ENFORCE_GE(ratio_, 0);
-    CAFFE_ENFORCE_LT(ratio_, 1);
   }
 
   bool RunOnDevice() override;
diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py
index d3a5c831d875d..ad2b6209cf4a7 100644
--- a/caffe2/python/operator_test/dropout_op_test.py
+++ b/caffe2/python/operator_test/dropout_op_test.py
@@ -74,3 +74,35 @@ def reference_dropout_ratio0(x):
             gc, op, [X], reference_dropout_ratio0,
             # Don't check the mask with cuDNN because it's packed data
             outputs_to_check=None if engine != 'CUDNN' else [0])
+
+
+    @given(X=hu.tensor(),
+           in_place=st.booleans(),
+           output_mask=st.booleans(),
+           engine=st.sampled_from(["", "CUDNN"]),
+           **hu.gcs)
+    @settings(deadline=10000)
+    def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc):
+        """Test with ratio=1 for a deterministic reference impl."""
+        if in_place:
+            # Skip if trying in-place on GPU
+            assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP})
+            # If in-place on CPU, don't compare with GPU
+            dc = dc[:1]
+        is_test = not output_mask
+        op = core.CreateOperator("Dropout", ["X"],
+                                 ["X" if in_place else "Y"] +
+                                 (["mask"] if output_mask else []),
+                                 ratio=1.0, engine=engine,
+                                 is_test=is_test)
+
+        self.assertDeviceChecks(dc, op, [X], [0])
+        if not is_test:
+            self.assertGradientChecks(gc, op, [X], 0, [0])
+
+        def reference_dropout_ratio1(x):
+            return (x,) if is_test else (np.zeros(x.shape, dtype=np.float32), np.zeros(x.shape, dtype=np.bool_))
+        self.assertReferenceChecks(
+            gc, op, [X], reference_dropout_ratio1,
+            # Don't check the mask with cuDNN because it's packed data
+            outputs_to_check=None if engine != 'CUDNN' else [0])
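
For illustration, a minimal sketch (not part of the patch) of exercising the new ratio = 1.0 behavior through the Caffe2 Python API. It assumes a build that includes this change; the blob names "X", "Y", and "mask" are illustrative. With ratio = 1 the keep-probability is 1 - ratio = 0, and scale falls back to 0 instead of dividing by zero, so the output is exactly zero:

```
# Minimal sketch: zero out a layer by running Dropout with ratio=1.0.
import numpy as np
from caffe2.python import core, workspace

X = np.random.randn(4, 8).astype(np.float32)
workspace.FeedBlob("X", X)

# Before this patch, ratio=1.0 was rejected by CAFFE_ENFORCE_LT(ratio_, 1).
# Now scale = 0 instead of 1 / (1 - 1), so the output is all zeros.
op = core.CreateOperator(
    "Dropout", ["X"], ["Y", "mask"], ratio=1.0, is_test=0)
workspace.RunOperatorOnce(op)

Y = workspace.FetchBlob("Y")
mask = workspace.FetchBlob("mask")
assert np.all(Y == 0.0)  # every activation is dropped
assert not mask.any()    # keep-probability 1 - ratio = 0, so mask is all False
```

Treating ratio_ >= 1.0 as scale = 0 (in both DropoutOp and DropoutGradientOp) leaves the existing code path for ratio < 1 untouched while avoiding the division by zero.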