
Commit 7318d2a

update int8 sdpa cpu

1 parent 545b2b8

File tree

5 files changed: +922 −1305 lines


setup.py

Lines changed: 14 additions & 0 deletions
@@ -70,6 +70,20 @@ def get_extensions():
         "cxx": [
             "-O3" if not debug_mode else "-O0",
             "-fdiagnostics-color=always",
+            # ## AVX2
+            # "-DCPU_CAPABILITY=AVX2",
+            # "-DCPU_CAPABILITY_AVX2",
+            # "-mavx2",
+            # "-mfma",
+            # "-mf16c",
+            ## AVX512
+            "-DCPU_CAPABILITY=AVX512",
+            "-DCPU_CAPABILITY_AVX512",
+            "-mavx512f",
+            "-mavx512bw",
+            "-mavx512vl",
+            "-mavx512dq",
+            "-mfma",
         ],
         "nvcc": [
             "-O3" if not debug_mode else "-O0",

test/quantization/test_sfdp_int8_fx_pass.py

Lines changed: 5 additions & 5 deletions
@@ -16,7 +16,7 @@
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
 
 import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
-from torch._export import capture_pre_autograd_graph
+from torch.export import export_for_training
 from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
 from torch.ao.quantization.quantizer.x86_inductor_quantizer import (
     X86InductorQuantizer,
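capture_pre_autograd_graph lives in the private torch._export namespace and has been deprecated; torch.export.export_for_training is the public replacement. Unlike the old API it returns an ExportedProgram rather than a GraphModule, which is why the call site below appends .module().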
@@ -65,7 +65,7 @@ def forward(self, x, mask):
         if self.has_mask:
             scores = scores + mask
         attention = self.softmax(scores)
-        # attention = self.dropout(attention)
+        attention = self.dropout(attention)
         context_layer = torch.matmul(attention, v)
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         context_layer = context_layer.view(
@@ -75,7 +75,7 @@ def forward(self, x, mask):
 
 
 def _generate_qdq_quantized_model(mod, inputs, quantizer):
     with torch.no_grad():
-        export_model = capture_pre_autograd_graph(mod, inputs)
+        export_model = export_for_training(mod, inputs).module()
         prepare_model = prepare_pt2e(export_model, quantizer)
         prepare_model(*inputs)
         convert_model = convert_pt2e(prepare_model)
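For context, a minimal sketch of the PT2E quantization flow this helper wraps. Here mod and inputs stand in for any eager module and a tuple of example inputs, and the set_global configuration line is an assumption rather than part of this diff:

import torch
import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
from torch.export import export_for_training

quantizer = X86InductorQuantizer()
quantizer.set_global(xiq.get_default_x86_inductor_quantization_config())

with torch.no_grad():
    # Trace the eager module into a pre-autograd ATen graph.
    export_model = export_for_training(mod, inputs).module()
    # Insert observers where the quantizer's config calls for them.
    prepare_model = prepare_pt2e(export_model, quantizer)
    # Calibrate by running representative inputs through the observed model.
    prepare_model(*inputs)
    # Swap observers for quantize/dequantize (Q/DQ) ops.
    convert_model = convert_pt2e(prepare_model)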
@@ -173,10 +173,10 @@ def _test_sdpa_rewriter_int8_1_to_4(self):
             if dtype == torch.bfloat16
             else contextlib.nullcontext()
         )
-        inputs = [
+        inputs = (
             torch.randn((bs, 384, 64 * 16), device=self.device, dtype=dtype),
             torch.randn((bs, 1, 1, 384), device=self.device) if has_mask else None,
-        ]
+        )
         with torch.no_grad(), maybe_autocast:
             _sfdp_init_int8()
             quantizer = X86InductorQuantizer()
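The example inputs switch from a list to a tuple because torch.export-style APIs expect the positional example arguments as a tuple, matching the export_for_training(mod, inputs) call in _generate_qdq_quantized_model above.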
