int8 output for seq embeddings (#2316)

Leon Gao · facebook-github-bot · commit ea763e9a91ed · 2024-02-06T13:51:29.000-08:00
Summary:

* int8 output dtype is a gap for a recent fbgemm use case; set up a reasonable memcpy-based reference implementation first.
* For sequence embeddings, we first unblock dispatch via a simple memcpy. It is a pure bandwidth-bound op (no dequantization), so memcpy should be reasonably OK. Further optimizations, such as ILP via unrolling, AVX non-temporal instructions, or rep instructions, are left to future iterations.

Differential Revision: D53449813
diff --git a/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp
@@ -166,20 +166,22 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
     }
 
     Tensor output;
-    const int kINT8QparamsBytes = 8;
     SparseType o_dtype = static_cast<SparseType>(output_dtype);
     TORCH_CHECK(o_dtype == SparseType::FP32 || o_dtype == SparseType::FP16 || o_dtype == SparseType::INT8 || o_dtype == SparseType::BF16);
     bool output_is_bf16 = o_dtype == SparseType::BF16;
+    bool output_is_int8 = o_dtype == SparseType::INT8;
     {% if not nobag %}
+    const int kINT8QparamsBytes = 8;
     int64_t total_adjusted_D = total_D;
     if (o_dtype == SparseType::INT8) {
       total_adjusted_D += T * kINT8QparamsBytes;
     }
     output = at::empty({B, total_adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
     {% else %}
+    const int kINT8QparamsBytes = 4; // no bag int8 output aligns with fbgemm weights storage size and layout
     int64_t adjusted_D = D;
     if (o_dtype == SparseType::INT8) {
-      adjusted_D += T * kINT8QparamsBytes;
+      adjusted_D += kINT8QparamsBytes;
     }
     output = at::empty({total_L, adjusted_D}, dev_weights.options().dtype(getScalarType(o_dtype)).pinned_memory(pinned_memory));
 
@@ -202,11 +204,15 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
 
         using float16 = uint16_t;
         using bfloat16 = uint16_t;
-        using fbgemm_out_t = typename std::conditional<
+        using int8 = uint8_t;
+        using base_fbgemm_out_t = typename std::conditional<
+            std::is_same<output_t, at::Half>::value,
+            float16,
+            std::conditional<std::is_same<output_t, at::BFloat16>::value, bfloat16, std::conditional<std::is_same<output_t, float>::value, float, int8>::type> ::type >::type;
+        using other_fbgemm_out_t = typename std::conditional<
             std::is_same<output_t, at::Half>::value,
             float16,
             std::conditional<std::is_same<output_t, at::BFloat16>::value, bfloat16, float>::type >::type;
-
         AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_", [&] {
             const auto* indices_acc = indices.data_ptr<index_t>();
             const auto* offsets_acc = offsets.data_ptr<index_t>();
@@ -224,7 +230,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                 const int32_t D_end = D_offsets_acc[t + 1];
                 const int32_t D = D_end - D_start;
                 {% else %}
-                const int32_t D_start = offsets_acc[t * B] * D;
+                const int32_t D_start = offsets_acc[t * B] * adjusted_D;
                 {% endif %}
 
                 const auto placement = static_cast<PlacementType>(weights_placements_ptr[t]);
@@ -233,6 +239,9 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                 weights_acc = weight_tensor.data_ptr<uint8_t>();
                 const uint8_t* weights = &weights_acc[weights_offsets_acc[t]];
                 const auto weight_ty = static_cast<SparseType>(weights_tys_acc[t]);
+                if (output_is_int8) {
+                    TORCH_CHECK(weight_ty == SparseType::INT8, "int8 output are only supported for int8 weights");
+                }
                 // default to 1 byte alignment for CPU TBE
                 const int32_t D_bytes = nbit::padded_row_size_in_bytes(D, weight_ty, row_alignment);
 
@@ -246,6 +255,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                 const bool normalize_by_lengths = static_cast<PoolingMode>(pooling_mode) == PoolingMode::MEAN;
 
                 const index_t index_size = offsets_acc[(t + 1) * B] - *offsets_begin_ptr;
+                const int32_t output_stride = {{ "total_D" if not nobag else "adjusted_D" }};
 
                 {% if nobag %}
                 // Create virtual offsets for the nobag case. Lengths are all ones.
@@ -256,6 +266,8 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                 {% endif %}
 
                 const float* indice_weights_ptr = nullptr;
+                // int8 output only enabled for nobag case with ref impl
+                const bool nobag_op = {{ "false" if not nobag else "output_is_int8" }};
                 {% if weighted %}
                 indice_weights_ptr = indice_weights_acc + *offsets_begin_ptr;
                 {% endif %}
@@ -266,6 +278,13 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                     if use_base else ("GenerateEmbeddingSpMDMNBitWithStrides"
                     if use_nbit else "GenerateEmbeddingSpMDMFP8WithStrides")
                  %}
+                using fbgemm_out_t = {{ "base_fbgemm_out_t" if use_base else "other_fbgemm_out_t" }};
+                // TODO: merge nobag int8 path with normal asmjit dispatch
+                {% if nobag %}
+                    const index_t* offset_ptr = (output_is_int8)? offsets_begin_ptr: offsets_nobag_ptr;
+                {% else %}
+                    const index_t* offset_ptr = offsets_begin_ptr;
+                {% endif %}
                 const auto kernel = fbgemm::{{ kernel_name }}<
                     {% if use_base %}
                     {{ weight_type }},
@@ -292,7 +311,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                     {% endif %}
                     /*is_weight_positional=*/false,
                     /*use_offsets=*/true,
-                    /*output_stride=*/{{ "total_D" if not nobag else "D" }},
+                    /*output_stride=*/output_stride,
                     /*input_stride=*/D_bytes / sizeof({{ weight_type }}),
                     {% if use_fp8 %}
                     /*exponent_bits=*/fp8_exponent_bits,
@@ -302,7 +321,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                     /*scale_bias_last=*/false,
                     {% endif %}
                     {% if use_base %}
-                    /*no_bag=*/false,
+                    /*no_bag=*/nobag_op,
                     {% endif %}
                     /*is_bf16_out=*/output_is_bf16
                 );
@@ -312,7 +331,7 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
                     num_rows,
                     reinterpret_cast<const {{ weight_type }}*>(weights),
                     indices_acc + *offsets_begin_ptr,
-                    {{ "offsets_begin_ptr" if not nobag else "offsets_nobag_ptr" }},
+                    offset_ptr,
                     indice_weights_ptr,
                     reinterpret_cast<fbgemm_out_t*>(output_acc + D_start));
                 {% endmacro %}
diff --git a/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py b/fbgemm_gpu/test/tbe/inference/nbit_forward_test.py
@@ -11,7 +11,7 @@
 import random
 import unittest
 
-from typing import Callable, Dict, List, Optional
+from typing import Callable, Dict, List, Optional, Tuple
 
 import hypothesis.strategies as st
 import numpy as np
@@ -91,6 +91,9 @@ def get_nbit_weights_ty(draw) -> Optional[SparseType]:
     "test_faketensor__test_nbit_forward_gpu_no_cache_fp8_2048": [
         unittest.skip("Operator not implemented for Meta tensors"),
     ],
+    "test_faketensor__test_nbit_forward_cpu_seq_int8": [
+        unittest.skip("Operator not implemented for Meta tensors"),
+    ],
 }
 
 
@@ -838,6 +841,100 @@ def test_nbit_forward_uvm_cache(
             output_ref = cc_ref(indices, offsets)
             torch.testing.assert_close(output, output_ref, equal_nan=True)
 
+    @given(
+        D=st.sampled_from([32, 256, 384, 512, 1024]),
+        B=st.integers(min_value=8, max_value=32),
+        T=st.integers(min_value=10, max_value=20),
+        L=st.integers(min_value=10, max_value=100),
+        MAXH=st.integers(min_value=50, max_value=100),
+    )
+    @settings(
+        verbosity=VERBOSITY,
+        max_examples=MAX_EXAMPLES_LONG_RUNNING,
+        deadline=None,
+    )
+    def test_nbit_forward_cpu_seq_int8(
+        self,
+        D: int,
+        B: int,
+        T: int,
+        L: int,
+        MAXH: int,
+    ) -> None:
+        """
+        we init a quant table split embedding bag with int8 weights and scale of 1 and 0 bias
+        and compare brute force table lookup vs tbe based int8 output lookup.
+        """
+        pooling_mode = PoolingMode.NONE
+
+        nbit_weights_ty = SparseType.INT8
+        D_alignment = (
+            1
+            if nbit_weights_ty.bit_rate() % 8 == 0
+            else int(8 / nbit_weights_ty.bit_rate())
+        )
+        D = round_up(D, D_alignment)
+        T_H = [np.random.randint(low=1, high=MAXH + 1) for _ in range(T)]
+        quant_cc = IntNBitTableBatchedEmbeddingBagsCodegen(
+            embedding_specs=[
+                (
+                    "",
+                    H,
+                    D,
+                    nbit_weights_ty,
+                    EmbeddingLocation.HOST,
+                )
+                for H in T_H
+            ],
+            pooling_mode=pooling_mode,
+            device="cpu",
+            output_dtype=nbit_weights_ty,
+        )
+        # Initialize the random weights for int nbit table split embedding bag
+        quant_cc.fill_random_weights()
+        raw_embedding_weights = quant_cc.split_embedding_weights()
+        # we mimic 1.0 scale, 0.0 bias for better results comparison
+        embedding_weights: List[Tuple[torch.Tensor, Optional[torch.Tensor]]] = [
+            (table_weight, torch.tensor([1, 0], dtype=torch.float16).view(torch.uint8))
+            for table_weight, _ in raw_embedding_weights
+        ]
+        # Overwrite the table weights with the fixed scale=1.0 / bias=0.0 qparams built above
+        quant_cc.assign_embedding_weights(embedding_weights)
+        lengths_list = [
+            torch.randint(
+                1,
+                L + 1,
+                (B,),
+            )
+            for _ in range(T)
+        ]
+        indices_list = [
+            torch.randint(0, H, (int(length.sum().item()),))
+            for length, H in zip(lengths_list, T_H)
+        ]
+        indices = torch.cat(indices_list, 0)
+        lengths = torch.cat(lengths_list, 0)
+        offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+        quant_cc_output = quant_cc(indices.int(), offsets.int())
+        tables_rows = [
+            T for T, _, _ in quant_cc.split_embedding_weights_with_scale_bias(0)
+        ]
+        ref_output = torch.cat(
+            [
+                table_rows[indice_table]
+                for indice_table, table_rows in zip(indices_list, tables_rows)
+            ],
+            dim=0,
+        )
+        torch.testing.assert_close(
+            quant_cc_output.cpu(),
+            ref_output.cpu(),
+            rtol=1e-2,
+            atol=1e-2,
+            equal_nan=False,
+        )
+
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/src/EmbeddingSpMDM.cc b/src/EmbeddingSpMDM.cc
@@ -1540,19 +1540,29 @@ GenerateEmbeddingSpMDMRowWiseSparse(
 #define INSTANTIATE_SPMDMFP8_BASE_float(INDEX_TYPE, OFFSET_TYPE, OUT_TYPE)
 #define INSTANTIATE_SPMDMFP8_BASE_uint16_t(INDEX_TYPE, OFFSET_TYPE, OUT_TYPE)
 
-#define INSTANTIATE_SPMDM_THREAD_LOCAL(                                     \
-    IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE)                             \
-  INSTANTIATE_SPMDM_BASE(IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, true)  \
-  INSTANTIATE_SPMDM_BASE(IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, false) \
-  INSTANTIATE_SPMDM_NOSTRIDE_BASE(                                          \
-      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, true)                     \
-  INSTANTIATE_SPMDM_NOSTRIDE_BASE(                                          \
-      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, false)                    \
+#define INSTANTIATE_SPMDM_BASE_THREAD_LOCAL(                               \
+    IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE)                            \
+  INSTANTIATE_SPMDM_BASE(IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, true) \
+  INSTANTIATE_SPMDM_BASE(IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, false)
+
+#define INSTANTIATE_SPMDM_NON_BASE_THREAD_LOCAL(         \
+    IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE)          \
+  INSTANTIATE_SPMDM_NOSTRIDE_BASE(                       \
+      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, true)  \
+  INSTANTIATE_SPMDM_NOSTRIDE_BASE(                       \
+      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, OUT_TYPE, false) \
   INSTANTIATE_SPMDMFP8_BASE_##IN_TYPE(INDEX_TYPE, OFFSET_TYPE, OUT_TYPE)
 
-#define INSTANTIATE_SPMDM_OUT_T(IN_TYPE, INDEX_TYPE, OFFSET_TYPE)            \
-  INSTANTIATE_SPMDM_THREAD_LOCAL(IN_TYPE, INDEX_TYPE, OFFSET_TYPE, float)    \
-  INSTANTIATE_SPMDM_THREAD_LOCAL(IN_TYPE, INDEX_TYPE, OFFSET_TYPE, uint16_t) \
+#define INSTANTIATE_SPMDM_OUT_T(IN_TYPE, INDEX_TYPE, OFFSET_TYPE)              \
+  INSTANTIATE_SPMDM_BASE_THREAD_LOCAL(IN_TYPE, INDEX_TYPE, OFFSET_TYPE, float) \
+  INSTANTIATE_SPMDM_BASE_THREAD_LOCAL(                                         \
+      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, uint16_t)                              \
+  INSTANTIATE_SPMDM_BASE_THREAD_LOCAL(                                         \
+      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, uint8_t)                               \
+  INSTANTIATE_SPMDM_NON_BASE_THREAD_LOCAL(                                     \
+      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, float)                                 \
+  INSTANTIATE_SPMDM_NON_BASE_THREAD_LOCAL(                                     \
+      IN_TYPE, INDEX_TYPE, OFFSET_TYPE, uint16_t)                              \
   INSTANTIATE_SPMDM_ROWWISE_BASE(IN_TYPE, INDEX_TYPE, OFFSET_TYPE)
 
 #define INSTANTIATE_SPMDM_OFFSET_T(IN_TYPE, INDEX_TYPE) \
diff --git a/src/RefImplementations.cc b/src/RefImplementations.cc