Enable global weight decay to TBE (Backend) (#2498)
Summary:
In the existing implementation of sparse embedding tables with rowwise adagrad, weight decay is applied only when an ID (and its corresponding embedding row) appears within a training batch. Rows that do not show up are neither updated nor decayed, so the embedding table only gets *local* but not *global* weight decay.
This diff adds an option to compensate for the skipped decay by scaling the weights with a `global weight decay` value, using the formula from csmiler below:
```
global_weight_decay = (1 - learning_rate * weight_decay)^(current_iter - prev_iter - 1)
```
where `prev_iter` is the last iteration at which this ID (and its corresponding embedding row) showed up.
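As a quick sanity check on this formula, the following standalone Python sketch (with hypothetical values; not part of this diff) confirms that applying the single compensation factor when a row reappears is equivalent to applying decoupled weight decay at each of the skipped iterations:
```python
# Hypothetical values for illustration only.
learning_rate = 0.1
weight_decay = 0.01
prev_iter = 100      # last iteration at which the ID appeared
current_iter = 105   # iteration at which the ID appears again

# Closed form from the summary: one factor covers all skipped iterations.
global_weight_decay = (1 - learning_rate * weight_decay) ** (current_iter - prev_iter - 1)

# Step-by-step equivalent: decay once per skipped iteration.
w = 1.0
for _ in range(prev_iter + 1, current_iter):
    w *= 1 - learning_rate * weight_decay

assert abs(w - global_weight_decay) < 1e-12  # both give ~0.996006
```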
---
**Usage:**
Set
```
optimizer = OptimType.EXACT_ROWWISE_ADAGRAD
weight_decay_mode = WeightDecayMode.DECOUPLE_GLOBAL
```
e.g.,
```
tbe = SplitTableBatchedEmbeddingBagsCodegen(
    embedding_specs=[
        (E, D, managed_option, ComputeDevice.CUDA) for (E, D) in zip(Es, Ds)
    ],
    optimizer=OptimType.EXACT_ROWWISE_ADAGRAD,
    learning_rate=0.1,
    eps=0.1,
    output_dtype=output_dtype,
    pooling_mode=pooling_mode,
    weight_decay_mode=WeightDecayMode.DECOUPLE_GLOBAL,
)
```
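For reference, the identifiers used above would typically come from imports along these lines (module paths as in recent `fbgemm_gpu` releases; verify against your installed version), with `Es`, `Ds`, `managed_option`, `output_dtype`, and `pooling_mode` supplied by the caller:
```python
from fbgemm_gpu.split_embedding_configs import EmbOptimType as OptimType
from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
    ComputeDevice,
    SplitTableBatchedEmbeddingBagsCodegen,
    WeightDecayMode,
)
```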
Relevant diffs:
D53866750
D55660277
D55660762
Differential Revision: D56285676