[caffe2] allow dropout to take 1.0 as dropout ratio to zero-out a layer (pytorch#72741)

Summary:
Pull Request resolved: pytorch#72741

as titled.

Context:
This is useful for quickly mitigating feature-induced overfitting: we can omni-transfer a trained model and apply dropout with ratio = 1 to the features that cause the overfitting. Directly removing those features is not feasible in omni-transfer scenarios, since the downstream FC sizes would change. See the sketch below for the intuition.
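For intuition only, a minimal NumPy sketch (the shapes, names, and two-feature setup are hypothetical, not taken from this PR): zeroing an overfitting feature's embedding with ratio = 1 preserves the concatenated width, so a downstream FC trained on the original width can be reused unchanged.

```python
import numpy as np

# Hypothetical embeddings for two sparse features (illustrative shapes only).
emb_keep = np.random.rand(4, 16).astype(np.float32)  # feature we keep
emb_drop = np.random.rand(4, 16).astype(np.float32)  # feature that overfits

# Dropout with ratio = 1.0 in training mode: scale is forced to 0,
# so the whole slice becomes zeros while keeping its shape.
emb_drop_zeroed = np.zeros_like(emb_drop)

fc_input = np.concatenate([emb_keep, emb_drop_zeroed], axis=1)  # width unchanged
fc_weight = np.random.rand(32, 8).astype(np.float32)            # trained on width 32
out = fc_input @ fc_weight
assert out.shape == (4, 8)  # downstream FC still applies without resizing
```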

Experimental records:
https://fb.quip.com/npIkAgRc8jl9#temp:C:DWC050ceaba14424d23a78462c01
Applying dropout = 1 on the selected features improves the eval NE over the next few hours (compared to the v0 baseline), as shown in the figures.

Test Plan:
```
buck test caffe2/caffe2/python/operator_test:dropout_op_test
```

Reviewed By: ustctf

Differential Revision: D34178732

fbshipit-source-id: 533feebe21bc582eefd756de397d5c7807c7438d
(cherry picked from commit 5dabf9c)
Xiaohan Wei authored and pytorchmergebot committed Feb 15, 2022
1 parent a7cac05 commit ca0ac3a
Showing 3 changed files with 34 additions and 5 deletions.
caffe2/operators/dropout_op.cc (2 additions, 3 deletions)
```diff
@@ -15,13 +15,12 @@ bool DropoutOp<float, CPUContext>::RunOnDevice() {
     return true;
   } else {
     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-    float scale = 1. / (1. - ratio_);
+    float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_);
     // mask=true means keep, and mask=false means not keep, so we will
     // generate probability depending on 1-ratio.
     at::bernoulli_distribution<double> dist(1. - ratio_);
     const float* Xdata = X.data<float>();
     float* Ydata = Y->template mutable_data<float>();
-
     auto mask = Output(1, X.sizes(), at::dtype<bool>());
     bool* mask_data = mask->template mutable_data<bool>();
     auto* gen = context_.RandGenerator();
@@ -52,7 +51,7 @@ bool DropoutGradientOp<float, CPUContext>::RunOnDevice() {
   const bool* mask_data = mask.data<bool>();
   float* dXdata = dX->template mutable_data<float>();
   // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-  float scale = 1. / (1. - ratio_);
+  float scale = ratio_ >= 1.0 ? 0.0:1. / (1. - ratio_);
   for (int i = 0; i < dY.numel(); ++i) {
     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
     dXdata[i] = dYdata[i] * mask_data[i] * scale;
```
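The only semantic change above is that the scale factor is forced to 0 when `ratio_ >= 1.0`, so the `1 / (1 - ratio_)` division is never evaluated at ratio 1. A minimal NumPy sketch of that arithmetic (not the C++ kernels themselves; function names are illustrative):

```python
import numpy as np

def dropout_forward(x, ratio, is_test=False, rng=np.random.default_rng(0)):
    """Mirror of the patched forward math: scale is 0 when ratio >= 1."""
    if is_test:
        return x, None
    scale = 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)
    mask = rng.random(x.shape) < (1.0 - ratio)  # keep-probability is 1 - ratio
    return x * mask * scale, mask

def dropout_backward(dy, mask, ratio):
    """Mirror of the patched gradient math."""
    scale = 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)
    return dy * mask * scale

x = np.ones((2, 3), dtype=np.float32)
y, mask = dropout_forward(x, ratio=1.0)
dx = dropout_backward(np.ones_like(x), mask, ratio=1.0)
assert np.all(y == 0) and np.all(dx == 0)  # ratio = 1 zeroes output and gradient
```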
caffe2/operators/dropout_op.h (0 additions, 2 deletions)
```diff
@@ -19,7 +19,6 @@ class DropoutOp final : public Operator<Context> {
         is_test_(
             this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
     CAFFE_ENFORCE_GE(ratio_, 0);
-    CAFFE_ENFORCE_LT(ratio_, 1);
   }
 
   bool RunOnDevice() override;
@@ -41,7 +40,6 @@ class DropoutGradientOp final : public Operator<Context> {
         is_test_(
             this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
     CAFFE_ENFORCE_GE(ratio_, 0);
-    CAFFE_ENFORCE_LT(ratio_, 1);
   }
 
   bool RunOnDevice() override;
```
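With the upper-bound enforce removed, constructing and running a Dropout op with ratio=1.0 should no longer throw. A hedged usage sketch with the caffe2 Python bindings (blob names and shapes are arbitrary):

```python
import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob("X", np.random.rand(4, 8).astype(np.float32))
op = core.CreateOperator(
    "Dropout", ["X"], ["Y", "mask"],
    ratio=1.0,  # previously rejected by CAFFE_ENFORCE_LT(ratio_, 1)
    is_test=0)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y"))  # expected to be all zeros at ratio = 1
```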
caffe2/python/operator_test/dropout_op_test.py (32 additions, 0 deletions)
```diff
@@ -74,3 +74,35 @@ def reference_dropout_ratio0(x):
             gc, op, [X], reference_dropout_ratio0,
             # Don't check the mask with cuDNN because it's packed data
             outputs_to_check=None if engine != 'CUDNN' else [0])
+
+
+    @given(X=hu.tensor(),
+           in_place=st.booleans(),
+           output_mask=st.booleans(),
+           engine=st.sampled_from(["", "CUDNN"]),
+           **hu.gcs)
+    @settings(deadline=10000)
+    def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc):
+        """Test with ratio=1 for a deterministic reference impl."""
+        if in_place:
+            # Skip if trying in-place on GPU
+            assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP})
+            # If in-place on CPU, don't compare with GPU
+            dc = dc[:1]
+        is_test = not output_mask
+        op = core.CreateOperator("Dropout", ["X"],
+                                 ["X" if in_place else "Y"] +
+                                 (["mask"] if output_mask else []),
+                                 ratio=1.0, engine=engine,
+                                 is_test=is_test)
+
+        self.assertDeviceChecks(dc, op, [X], [0])
+        if not is_test:
+            self.assertGradientChecks(gc, op, [X], 0, [0])
+
+        def reference_dropout_ratio1(x):
+            return (x,) if is_test else (np.zeros(x.shape, dtype=np.float), np.zeros(x.shape, dtype=np.bool))
+        self.assertReferenceChecks(
+            gc, op, [X], reference_dropout_ratio1,
+            # Don't check the mask with cuDNN because it's packed data
+            outputs_to_check=None if engine != 'CUDNN' else [0])
```
