
Commit 9926a40

jenriver authored and tensorflower-gardener committed
Add quantization/legalization for stablehlo.add and respective pipeline changes.
* Added `enable_full_int_quantization` to `StaticRangePtqPreset` to control full int quantization. It defaults to `false`, meaning only compute-heavy ops are quantized unless it is explicitly enabled.
* Added tests for the above config change.
* Follow-up tests will include e2e Python tests.

PiperOrigin-RevId: 620067140
1 parent 2d48a6d commit 9926a40

19 files changed: +363 additions, -186 deletions
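As a quick orientation for the config change described above, here is a minimal usage sketch. The proto setters and `ExpandPresets()` are taken from this commit's config.cc/config_test.cc; the include path, namespace, and helper name are assumptions, not part of the commit.

// Sketch only: include path, namespace, and helper name are assumptions.
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h"

namespace stablehlo::quantization {

// Builds a static-range PTQ config that quantizes all quantizable ops,
// not only the compute-heavy conv/dot/gather composites.
QuantizationConfig BuildFullIntPtqConfig() {
  QuantizationConfig config{};
  config.mutable_static_range_ptq_preset()->set_enable_full_int_quantization(
      true);
  // With the flag set, preset expansion emits a default spec whose matcher
  // regex is ".*"; without it, the regex is "^.*(conv|dot|gather).*".
  return ExpandPresets(config);
}

}  // namespace stablehlo::quantization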

tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir

Lines changed: 103 additions & 69 deletions
Large diffs are not rendered by default.

tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc

Lines changed: 79 additions & 85 deletions
Large diffs are not rendered by default.

tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc

Lines changed: 5 additions & 3 deletions
@@ -98,10 +98,11 @@ void PopulateDefaultCalibrationOptions(QuantizationConfig& quant_config) {
 // {matcher {function_name {regex: ".*"}}
 //  {method {static_range_ptq {}}}
 // }
-QuantizationSpec GetDefaultStaticRangePtqSpec() {
+QuantizationSpec GetDefaultStaticRangePtqSpec(StaticRangePtqPreset preset) {
   QuantizationSpec spec{};
   // Default for all ops.
-  spec.mutable_matcher()->mutable_function_name()->set_regex(".*");
+  spec.mutable_matcher()->mutable_function_name()->set_regex(
+      preset.enable_full_int_quantization() ? ".*" : "^.*(conv|dot|gather).*");
   spec.mutable_method()->mutable_static_range_ptq();

   return spec;
@@ -161,7 +162,8 @@ void ExpandStaticRangePtqPreset(const StaticRangePtqPreset& preset,
   // expansion from `StaticRangePtqPreset` gets populated first and then
   // user-provided explicit `QuantizationSpec`s will be appended.
   QuantizationSpecs new_specs{};
-  *new_specs.add_specs() = GetDefaultStaticRangePtqSpec();
+  *new_specs.add_specs() =
+      GetDefaultStaticRangePtqSpec(/*preset=*/config.static_range_ptq_preset());
   *new_specs.add_specs() = GetStaticRangePtqSpecForConvolution();

   // Append user-provided specs to override existing specs.
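To make the regex change concrete, here is a small illustration (not from this commit) of which lifted function names each default matcher selects. std::regex stands in for the framework's own matcher, and the composite function names are representative examples, so treat this purely as a sketch.

// Illustration only: std::regex stands in for the framework's matcher, and
// the composite function names are representative examples.
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
  const std::regex compute_heavy_only("^.*(conv|dot|gather).*");  // default preset
  const std::regex full_int(".*");  // enable_full_int_quantization = true
  const std::vector<std::string> names = {
      "composite_conv_fn_1", "composite_dot_general_fn_1",
      "composite_gather_fn_1", "composite_add_fn_1"};
  for (const std::string& name : names) {
    std::cout << name
              << "  compute-heavy preset: " << std::regex_match(name, compute_heavy_only)
              << "  full-int preset: " << std::regex_match(name, full_int) << "\n";
  }
  // composite_add_fn_1 is only matched when full int quantization is enabled.
  return 0;
}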

tensorflow/compiler/mlir/quantization/stablehlo/cc/config_test.cc

Lines changed: 20 additions & 2 deletions
@@ -147,10 +147,12 @@ TEST(ExpandPresetsTest, ExpandUnspecifiedPreset) {
   EXPECT_FALSE(new_config.has_pipeline_config());
 }

-TEST(ExpandPresetsTest, ExpandStaticRangePtqPreset) {
+TEST(ExpandPresetsTest, ExpandStaticRangePtqEnableFullIntquantization) {
   QuantizationConfig config{};
   RepresentativeDatasetConfig& preset_dataset_config =
       *config.mutable_static_range_ptq_preset()->add_representative_datasets();
+  config.mutable_static_range_ptq_preset()->set_enable_full_int_quantization(
+      true);
   preset_dataset_config.mutable_tf_record()->set_path("/test/path");

   const QuantizationConfig new_config = ExpandPresets(config);
@@ -185,6 +187,21 @@ TEST(ExpandPresetsTest, ExpandStaticRangePtqPreset) {
               StrEq("/test/path"));
 }

+TEST(ExpandPresetsTest, ExpandStaticRangePtqPresetDefault) {
+  QuantizationConfig config{};
+  RepresentativeDatasetConfig& preset_dataset_config =
+      *config.mutable_static_range_ptq_preset()->add_representative_datasets();
+  preset_dataset_config.mutable_tf_record()->set_path("/test/path");
+
+  const QuantizationConfig new_config = ExpandPresets(config);
+  ASSERT_THAT(new_config.specs().specs(), SizeIs(2));
+
+  const QuantizationSpec& spec = new_config.specs().specs(0);
+  EXPECT_THAT(spec.matcher().function_name().regex(),
+              StrEq("^.*(conv|dot|gather).*"));
+  EXPECT_TRUE(spec.method().has_static_range_ptq());
+}
+
 TEST(ExpandPresetsTest,
      ExpandStaticRangePtqPresetWithTopLevelRepresentativeDataset) {
   // Test the scenario where both
@@ -216,7 +233,8 @@ TEST(ExpandPresetsTest,

 TEST(ExpandPresetsTest, ExpandStaticRangePtqPresetThenAppendExplicitSpecs) {
   QuantizationConfig config{};
-  config.mutable_static_range_ptq_preset();
+  config.mutable_static_range_ptq_preset()->set_enable_full_int_quantization(
+      true);

   QuantizationSpec& user_provided_spec = *config.mutable_specs()->add_specs();
   user_provided_spec.mutable_matcher()->mutable_function_name()->set_regex(

tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc

Lines changed: 3 additions & 0 deletions
@@ -58,8 +58,11 @@ void AddPostCalibrationPasses(
     OpPassManager& pm, const PipelineConfig& pipeline_config,
     const StaticRangePtqPreset& static_range_ptq_preset) {
   QuantizeCompositeFunctionsPassOptions options;
+  // TODO: b/331120943 - Use QuantizationConfig instead of preset flags.
   options.enable_per_channel_quantized_weight_ =
       static_range_ptq_preset.enable_per_channel_quantized_weight();
+  options.enable_full_int_quantization_ =
+      static_range_ptq_preset.enable_full_int_quantization();
   // For debugging purposes.
   options.mlir_dump_file_name_ = "quantize_composite_functions";
   options.enable_weight_only_ = false;
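For context, a hedged sketch of how a caller might thread the preset's new flag into this pipeline; `AddPostCalibrationPasses` and its parameter types come from the hunk above, while the includes and namespace qualifiers are omitted here and treated as assumptions.

// Sketch only (fragment): includes and namespace qualifiers are omitted and
// would follow pass_pipeline.h and quantization_config.pb.h in this directory.
void BuildPostCalibrationPipeline(mlir::OpPassManager& pm) {
  PipelineConfig pipeline_config;
  StaticRangePtqPreset preset;
  // New in this commit: also quantize ops that are not compute-heavy,
  // e.g. stablehlo.add lifted as composite_add_fn.
  preset.set_enable_full_int_quantization(true);
  AddPostCalibrationPasses(pm, pipeline_config, preset);
}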

tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_simple.td

Lines changed: 8 additions & 0 deletions
@@ -67,3 +67,11 @@ def LiftGather : Pat<
       (NamedAttr<"slice_sizes"> $slice_sizes),
       (NamedAttr<"indices_are_sorted"> (DefaultOrNullAttr $indices_are_sorted)))),
   [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $operand)], [], (addBenefit 1)>;
+
+def LiftAdd : Pat<
+  (StableHLO_AddOp:$res
+      $lhs, $rhs),
+  (LiftAsTFXlaCallModule<"composite_add_fn">
+    (ArgumentList $lhs, $rhs),
+    (ResultList $res)),
+  [(IsNotInLiftedFunc $res), (IsNotInStableHloOpRegion $res)], [], (addBenefit 1)>;

tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td

Lines changed: 8 additions & 0 deletions
@@ -60,6 +60,10 @@ def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-function
            "enable-per-channel-quantized-weight",
            "bool", /*default=*/"true",
            "Whether to enable per-channel quantized weights.">,
+    Option<"enable_full_int_quantization_",
+           "enable-full-int-quantization",
+           "bool", /*default=*/"false",
+           "Whether to enable full int quantization, including non compute-heavy ops.">,
     Option<"mlir_dump_file_name_", "mlir-dump-file-name",
            "std::optional<std::string>", /*default=*/"std::nullopt",
            "MLIR dump file name.">,
@@ -102,6 +106,10 @@ def QuantizePass : Pass<"stablehlo-quantize", "mlir::ModuleOp"> {
            "enable-per-channel-quantized-weight",
            "bool", /*default=*/"true",
            "Whether to enable per-channel quantized weights.">,
+    Option<"enable_full_int_quantization_",
+           "enable-full-int-quantization",
+           "bool", /*default=*/"false",
+           "Whether to apply full int quantization, including non compute-heavy ops.">,
     Option<"enable_weight_only_",
            "enable-weight-only",
            "bool", /*default=*/"false",

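Because these tablegen `Option` entries generate standard MLIR pass options, the new flag can in principle also be toggled from a textual pass pipeline. A hedged example, assuming an opt-style driver that registers the `stablehlo-quantize` pass (the driver name below is illustrative, not from this commit):

  quant-opt --pass-pipeline='builtin.module(stablehlo-quantize{enable-full-int-quantization=true})' input.mlir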
tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc

Lines changed: 6 additions & 0 deletions
@@ -954,6 +954,12 @@ void PopulateComputeHeavyPatterns(
   patterns.add<QuantizeOpWithRegionPattern>(ctx);
 }

+void PopulateAllQuantizablePatterns(MLIRContext& ctx,
+                                    RewritePatternSet& patterns) {
+  patterns.add<XlaCallModuleOpToCallOp<QuantizeSingularOpPattern<AddOp>>>(
+      ctx, /*enable_per_channel_quantized_weight=*/false);
+}
+
 void PopulateQuantizeWeightOnlyPatterns(MLIRContext& ctx,
                                         RewritePatternSet& patterns) {
   patterns.add<

tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h

Lines changed: 5 additions & 0 deletions
@@ -254,6 +254,11 @@ class StableHloQuantizationPattern : public OpRewritePattern<RootOpT> {
 void PopulateComputeHeavyPatterns(MLIRContext& ctx, RewritePatternSet& patterns,
                                   bool enable_per_channel_quantized_weight);

+// Populates conversion patterns for all quantizable ops, including
+// ops that are not compute-heavy and data movement ops.
+void PopulateAllQuantizablePatterns(MLIRContext& ctx,
+                                    RewritePatternSet& patterns);
+
 // Populates pattern weight-only quantization.
 void PopulateQuantizeWeightOnlyPatterns(MLIRContext& ctx,
                                         RewritePatternSet& patterns);

tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc

Lines changed: 7 additions & 0 deletions
@@ -96,9 +96,11 @@ class QuantizePass : public impl::QuantizePassBase<QuantizePass> {
   using impl::QuantizePassBase<QuantizePass>::QuantizePassBase;

   explicit QuantizePass(const bool enable_per_channel_quantized_weight,
+                        const bool enable_full_int_quantization,
                         const bool enable_weight_only,
                         const QuantizationSpecs& quant_specs) {
     enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight;
+    enable_full_int_quantization_ = enable_full_int_quantization;
     enable_weight_only_ = enable_weight_only;
   }

@@ -120,6 +122,11 @@ void QuantizePass::runOnOperation() {
   PopulateComputeHeavyPatterns(ctx, patterns,
                                enable_per_channel_quantized_weight_);

+  // Quantize all quantizable ops, including ops that are not compute-heavy.
+  if (enable_full_int_quantization_) {
+    PopulateAllQuantizablePatterns(ctx, patterns);
+  }
+
   if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) {
     // There are cases where no rewrites happen even if a pattern matches,
     // causing this to result in a convergence failure. Consider this as a
