From 600f5cbdecb027be2488297cca6a825b1f56af9a Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Mon, 25 Jan 2021 00:02:30 +0000 Subject: [PATCH] Integrate intgemm into marian (#595) Adds intgemm as a module for Marian. Intgemm is @kpu 's 8/16 bit gemm library with support for architectures from SSE2 to AVX512VNNI Removes outdated integer code, related to the --optimize option Co-authored-by: Kenneth Heafield Co-authored-by: Kenneth Heafield Co-authored-by: Ulrich Germann Co-authored-by: Marcin Junczys-Dowmunt Co-authored-by: Roman Grundkiewicz --- .gitmodules | 3 + CHANGELOG.md | 5 + CMakeLists.txt | 5 +- regression-tests | 2 +- src/3rd_party/CMakeLists.txt | 5 + src/3rd_party/intgemm | 1 + src/CMakeLists.txt | 8 +- src/command/marian_conv.cpp | 28 +- src/common/binary.cpp | 21 +- src/common/config_parser.cpp | 15 +- src/common/types.cpp | 11 +- src/common/types.h | 174 +++-- src/data/shortlist.h | 8 + src/embedder/embedder.h | 5 - src/graph/expression_operators.cpp | 48 +- src/microsoft/quicksand.cpp | 3 +- src/rescorer/rescorer.h | 5 - src/tensors/backend.h | 13 +- src/tensors/cpu/aligned.h | 43 ++ src/tensors/cpu/backend.h | 8 +- src/tensors/cpu/device.cpp | 38 +- src/tensors/cpu/expression_graph_packable.h | 266 ++++++++ src/tensors/cpu/fbgemm/expanded_gemm.h | 19 +- .../cpu/fbgemm/expression_graph_packable.h | 156 ----- src/tensors/cpu/int16.h | 113 ---- src/tensors/cpu/integer_common.cpp | 45 ++ src/tensors/cpu/integer_common.h | 223 +++++++ src/tensors/cpu/intgemm_interface.h | 132 ++++ src/tensors/cpu/prod.cpp | 13 +- src/tensors/cpu/sharp/avx_gemm.cpp | 615 ------------------ src/tensors/cpu/sharp/int_gemm.cpp | 187 ------ src/tensors/cpu/sharp/int_gemm.h | 36 - src/tensors/cpu/sharp/sse_gemm.cpp | 341 ---------- src/tensors/gpu/backend.h | 10 - src/tests/prod.cpp | 43 +- src/training/graph_group_async.cpp | 1 - src/training/graph_group_singleton.h | 1 - src/training/graph_group_sync.cpp | 1 - src/translator/translator.h | 8 - 39 files changed, 986 insertions(+), 1673 deletions(-) create mode 160000 src/3rd_party/intgemm create mode 100644 src/tensors/cpu/aligned.h create mode 100644 src/tensors/cpu/expression_graph_packable.h delete mode 100644 src/tensors/cpu/fbgemm/expression_graph_packable.h delete mode 100644 src/tensors/cpu/int16.h create mode 100644 src/tensors/cpu/integer_common.cpp create mode 100644 src/tensors/cpu/integer_common.h create mode 100644 src/tensors/cpu/intgemm_interface.h delete mode 100644 src/tensors/cpu/sharp/avx_gemm.cpp delete mode 100644 src/tensors/cpu/sharp/int_gemm.cpp delete mode 100644 src/tensors/cpu/sharp/int_gemm.h delete mode 100644 src/tensors/cpu/sharp/sse_gemm.cpp diff --git a/.gitmodules b/.gitmodules index 6cb63fc0b..a1a876d8b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,6 +14,9 @@ path = src/3rd_party/fbgemm url = https://github.com/marian-nmt/FBGEMM branch = master +[submodule "src/3rd_party/intgemm"] + path = src/3rd_party/intgemm + url = https://github.com/marian-nmt/intgemm/ [submodule "src/3rd_party/simple-websocket-server"] path = src/3rd_party/simple-websocket-server url = https://github.com/marian-nmt/Simple-WebSocket-Server diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e71c7daf..3ad291ee8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Added `intgemm8(ssse3|avx|avx512)?`, `intgemm16(sse2|avx|avx512)?` types to marian-conv with uses intgemm backend. 
Types intgemm8 and intgemm16 are hardware-agnostic, the other ones hardware-specific. +- Shortlist is now always multiple-of-eight. +- Added intgemm 8/16bit integer binary architecture agnostic format. - Add --train-embedder-rank for fine-tuning any encoder(-decoder) model for multi-lingual similarity via softmax-margin loss - Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument. - Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation @@ -56,6 +59,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fix the runtime failures for FASTOPT on 32-bit builds (wasm just happens to be 32-bit) because it uses hashing with an inconsistent mix of uint64_t and size_t. ### Changed +- Remove `--clip-gemm` which is obsolete and was never used anyway +- Removed `--optimize` switch, instead we now determine compute type based on binary model. - Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece. - Enabled compilation of SentencePiece by default since no dependency on protobuf anymore. - Changed default value of --sentencepiece-max-lines from 10000000 to 2000000 since apparently the new version doesn't sample automatically anymore (Not quite clear how that affects quality of the vocabulary). diff --git a/CMakeLists.txt b/CMakeLists.txt index 343a3e3b0..aa43708c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,8 +93,8 @@ if(MSVC) # Or maybe use these? set(INTRINSICS "/arch:AVX2") # set(INTRINSICS "/arch:AVX512") - - set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS ${DISABLE_GLOBALLY}") + # /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj + set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG") @@ -438,6 +438,7 @@ endif(USE_MPI) ############################################################################### # Find BLAS library if(COMPILE_CPU) + set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU if(USE_APPLE_ACCELERATE) if(NOT APPLE) message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.") diff --git a/regression-tests b/regression-tests index 16914ae94..97b2f95ab 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 16914ae94c80f338c678f0461c4e45965149f6aa +Subproject commit 97b2f95abab6134c1632b286e373e513ecc52020 diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 4013d2607..0b81e436c 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -8,6 +8,11 @@ add_subdirectory(./zlib) add_subdirectory(./faiss) include_directories(./faiss) +if(COMPILE_CPU) + set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") + add_subdirectory(./intgemm) +endif(COMPILE_CPU) + if(USE_FBGEMM) # @TODO: find out if this is somehow harmful. 
This is supppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS # meant to silence CMakeFiles of 3rd_party tools. diff --git a/src/3rd_party/intgemm b/src/3rd_party/intgemm new file mode 160000 index 000000000..874ceebbf --- /dev/null +++ b/src/3rd_party/intgemm @@ -0,0 +1 @@ +Subproject commit 874ceebbf53a85691b326495100b6361a2166cec diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6dcf7fd89..64112ffe7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,8 @@ include_directories(3rd_party) include_directories(3rd_party/SQLiteCpp/include) include_directories(3rd_party/sentencepiece) include_directories(3rd_party/fbgemm/include) +include_directories(3rd_party/intgemm) +include_directories(${CMAKE_BINARY_DIR}/src/3rd_party/intgemm) # running cmake on the intgemm submodule triggers config file generation in this directory. include_directories(${CMAKE_BINARY_DIR}/local/include) set(MARIAN_SOURCES @@ -41,6 +43,7 @@ set(MARIAN_SOURCES 3rd_party/cnpy/cnpy.cpp 3rd_party/ExceptionWithCallStack.cpp + 3rd_party/onnx/protobuf/onnx-ml.pb-wrapper.cpp 3rd_party/phf/phf.cc @@ -52,10 +55,7 @@ set(MARIAN_SOURCES tensors/cpu/prod.cpp tensors/cpu/topk.cpp tensors/cpu/tensor_operators.cpp - - tensors/cpu/sharp/int_gemm.cpp - tensors/cpu/sharp/avx_gemm.cpp - tensors/cpu/sharp/sse_gemm.cpp + tensors/cpu/integer_common.cpp tensors/cpu/fbgemm/packed_gemm.cpp graph/expression_graph.cpp diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index 9bdcfb405..26cac858f 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -1,12 +1,10 @@ #include "marian.h" - #include "common/cli_wrapper.h" +#include "tensors/cpu/expression_graph_packable.h" +#include "onnx/expression_graph_onnx_exporter.h" #include -#include "tensors/cpu/fbgemm/expression_graph_packable.h" -#include "onnx/expression_graph_onnx_exporter.h" - int main(int argc, char** argv) { using namespace marian; @@ -24,7 +22,9 @@ int main(int argc, char** argv) { cli->add("--from,-f", "Input model", "model.npz"); cli->add("--to,-t", "Output model", "model.bin"); cli->add("--export-as", "Kind of conversion: marian-bin or onnx-{encode,decoder-step,decoder-init,decoder-stop}", "marian-bin"); - cli->add("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32"); + cli->add("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512, " + "intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512", + "float32"); cli->add>("--vocabs,-V", "Vocabulary file, required for ONNX export"); cli->parse(argc, argv); options->merge(config); @@ -35,19 +35,8 @@ int main(int argc, char** argv) { auto exportAs = options->get("export-as"); auto vocabPaths = options->get>("vocabs");// , std::vector()); - auto saveGemmTypeStr = options->get("gemm-type", "float32"); - Type saveGemmType; - if(saveGemmTypeStr == "float32") { - saveGemmType = Type::float32; - } else if(saveGemmTypeStr == "packed16") { // packed16 only supports AVX2. 
AVX512 might be added later - saveGemmType = Type::packed16; - } else if(saveGemmTypeStr == "packed8avx2") { // packed8 for AVX2 - saveGemmType = Type::packed8avx2; - } else if(saveGemmTypeStr == "packed8avx512") { // packed8 for AVX512 - saveGemmType = Type::packed8avx512; - } else { - ABORT("Unknown gemm-type: {}", saveGemmTypeStr); - } + // We accept any type here and will later croak during packAndSave if the type cannot be used for conversion + Type saveGemmType = typeFromString(options->get("gemm-type", "float32")); LOG(info, "Outputting {}, precision: {}", modelTo, saveGemmType); @@ -58,12 +47,11 @@ int main(int argc, char** argv) { auto load = [&](Ptr graph) { graph->setDevice(CPU0); - graph->getBackend()->setOptimized(false); - graph->load(modelFrom); graph->forward(); // run the initializers }; + if (exportAs == "marian-bin") { auto graph = New(); load(graph); diff --git a/src/common/binary.cpp b/src/common/binary.cpp index 1531fed6c..60f651f2d 100644 --- a/src/common/binary.cpp +++ b/src/common/binary.cpp @@ -3,6 +3,7 @@ #include "common/file_stream.h" #include "common/io_item.h" #include "common/types.h" +#include "tensors/cpu/integer_common.h" #include @@ -57,13 +58,31 @@ void loadItems(const void* current, std::vector& items, bool mapped) { get(current, offset); for(int i = 0; i < numHeaders; ++i) { + // For intgemm AVX512 and AVX512VNNI have the same arangement, but the VNNI algorithm is faster. + // Change the type to the fastest one supported. + if (items[i].type == Type::intgemm8avx512) { + items[i].type = cpu::integer::getIntgemmType(Type::intgemm8); + } if(items[i].mapped) { // memory-mapped, hence only set pointer + // @TOOD: verify this actually works for the hardware-specific ones like intgemm8avx2 + ABORT_IF(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16, "mmap format not supported for hardware non-specific intgemm matrices"); items[i].ptr = get(current, headers[i].dataLength); } else { // reading into item data size_t len = headers[i].dataLength; items[i].bytes.resize(len); const char* ptr = get(current, len); - std::copy(ptr, ptr + len, items[i].bytes.begin()); + // Intgemm8/16 matrices in binary model are just quantized, however they also need to be reordered + // Reordering depends on the architecture (SSE/AVX2/AVX512) so we read in the quantized matrices and + // then reorder them before adding them as a parameter in the graph. + if (matchType(items[i].type)) { + items[i].type = cpu::integer::getIntgemmType(Type::intgemm8); + cpu::integer::prepareAndTransposeB(items[i], ptr); + } else if (matchType(items[i].type)) { + items[i].type = cpu::integer::getIntgemmType(Type::intgemm16); + cpu::integer::prepareAndTransposeB(items[i], ptr); + } else { + std::copy(ptr, ptr + len, items[i].bytes.begin()); + } } } } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index f72475f64..1c2de918f 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -134,8 +134,6 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { "Suppress logging for translation"); cli.add("--seed", "Seed for all random number generators. 
0 means initialize randomly"); - cli.add("--clip-gemm", - "If not 0 clip GEMM input values to +/- arg"); cli.add("--interpolate-env-vars", "allow the use of environment variables in paths, of the form ${VAR_NAME}"); cli.add("--relative-paths", @@ -671,15 +669,13 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { addSuboptionsDevices(cli); addSuboptionsBatching(cli); - cli.add("--optimize", - "Optimize speed aggressively sacrificing memory or precision"); - cli.add("--skip-cost", - "Ignore model cost during translation, not recommended for beam-size > 1"); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", "Mixed precision for inference, set parameter type in expression graph", {"float32"}); + cli.add("--skip-cost", + "Ignore model cost during translation, not recommended for beam-size > 1"); cli.add>("--shortlist", "Use softmax shortlist: path first best prune"); @@ -737,8 +733,6 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { addSuboptionsDevices(cli); addSuboptionsBatching(cli); - cli.add("--optimize", - "Optimize speed aggressively sacrificing memory or precision"); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", @@ -776,12 +770,10 @@ void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) { addSuboptionsDevices(cli); addSuboptionsBatching(cli); - cli.add("--optimize", - "Optimize speed aggressively sacrificing memory or precision"); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", - "Mixed precision for inference, set parameter type in expression graph", + "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16", {"float32"}); cli.switchGroup(previous_group); @@ -934,7 +926,6 @@ void ConfigParser::addSuboptionsQuantization(cli::CLIWrapper& cli) { // clang-format on } - cli::mode ConfigParser::getMode() const { return mode_; } Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { diff --git a/src/common/types.cpp b/src/common/types.cpp index f358cdb62..76cc13f06 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -26,13 +26,16 @@ size_t requiredBytes(const Shape& shape, Type type) { ABORT("Not a supported data type: {}", type); return 0; } + } +#endif // USE_FBGEMM + + if (isIntgemm(type)) { + /* Intgemm tensors have an extra float at the back that stores the quantization multiplier */ + return shape.elements() * sizeOf(type) + sizeOf(Type::float32); } else { return shape.elements() * sizeOf(type); } -#else - return shape.elements() * sizeOf(type); -#endif // USE_FBGEMM } -} \ No newline at end of file +} // namespace marian \ No newline at end of file diff --git a/src/common/types.h b/src/common/types.h index 4bc4f9ad0..0f70bb228 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -135,20 +135,26 @@ do { \ namespace marian { // small struct to enable templating based on types use for packing -struct packed16 { - uint16_t x; -}; +struct packed16 { uint16_t x; }; // small struct to enable templating based on types use for packing. This is a memory holder. // There's no difference between packed8avx2 and packed8avx512. But, they are separately defined to be distinguished. 
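For orientation, the TypeClass/Type hunks below encode each tensor type as a set of class bits plus the element size in bytes; the new intgemm entries add an `intgemm_type` class bit and, for the hardware-specific variants, an ISA bit. A minimal standalone sketch of how those bits compose and decompose, with the relevant constants re-declared here purely for illustration (they live in the enums shown in the diff, not in this snippet):

```cpp
// Illustration only: these constants mirror the TypeClass values in the diff below.
#include <cassert>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t intgemm_type = 0x10000, avx2_type = 0x01000, sse2_type = 0x04000;
  const std::size_t size_mask = 0x000FF, class_mask = 0xFFF00;

  // Type::intgemm8avx2  = intgemm_type + 1u + avx2_type (1-byte elements, AVX2 layout)
  // Type::intgemm16sse2 = intgemm_type + 2u + sse2_type (2-byte elements, SSE2 layout)
  const std::size_t intgemm8avx2  = intgemm_type + 1u + avx2_type;
  const std::size_t intgemm16sse2 = intgemm_type + 2u + sse2_type;

  assert((intgemm8avx2 & size_mask) == 1);     // what sizeOf() reads
  assert((intgemm8avx2 & intgemm_type) != 0);  // what isIntgemm() checks
  assert((intgemm8avx2 & avx2_type) != 0);     // what isAvx2() checks
  assert((intgemm16sse2 & size_mask) == 2);
  assert((intgemm16sse2 & sse2_type) != 0);    // what isSse2() checks
  assert((intgemm8avx2 & class_mask) != (intgemm16sse2 & class_mask));

  std::printf("intgemm8avx2 = 0x%zx, intgemm16sse2 = 0x%zx\n", intgemm8avx2, intgemm16sse2);
  return 0;
}
```

The plain `intgemm8`/`intgemm16` values carry no ISA bit; as the binary.cpp hunk above shows, they are resolved to the fastest supported layout at load time via `cpu::integer::getIntgemmType()`.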
-struct packed8avx2 { - uint8_t x; -}; +struct packed8avx2 { uint8_t x; }; +struct packed8avx512 { uint8_t x; }; + +// similar to the packed16, but to use with 16bit intgemm model packing. +struct intgemm16 { int16_t x; }; +struct intgemm16sse2 { int16_t x; }; +struct intgemm16avx2 { int16_t x; }; +struct intgemm16avx512 { int16_t x; }; + +// similar to packed8* but for intgemm 8bit model packing. +struct intgemm8 { int8_t x; }; +struct intgemm8ssse3 { int8_t x; }; +struct intgemm8avx2 { int8_t x; }; +struct intgemm8avx512 { int8_t x; }; +struct intgemm8avx512vnni { int8_t x; }; -// small struct to enable templating based on types use for packing. This is a memory holder. -struct packed8avx512 { - uint8_t x; -}; #ifndef __CUDACC__ // vectorized types not available from .cu files @@ -214,17 +220,22 @@ struct float32x8 { #endif // Internal to types.h, don't use. Use test functions below. -enum class TypeClass : size_t { - signed_type = 0x0100, - unsigned_type = 0x0200, - float_type = 0x0400, - - packed_type = 0x0800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else - avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only - avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only - - size_mask = 0x00FF, - class_mask = 0xFF00 +enum class TypeClass : size_t { // size_type has 8 bytes, so we can have 16 fields here, currently using 5. Extend to the left for back-compat. + // built-in type classes + signed_type = 0x00100, + unsigned_type = 0x00200, + float_type = 0x00400, + + avx2_type = 0x01000, // processor-specific layout for avx2, currently used for FBGEMM only (keep 0x1000 for back-compat) + avx512_type = 0x02000, // processor-specific layout for avx512, currently used for FBGEMM only (keep 0x2000 for back-compat) + sse2_type = 0x04000, // processor-specific layout for sse2, currently used for Intgemm only + ssse3_type = 0x08000, // processor-specific layout for ssse3, currently used for Intgemm only + + packed_type = 0x00800, // special packed (CPU cache friendly) type class, used in FBGEMM. Annoyingly we need to keep 0x800 for back-compat, would be nicer to align with intgemm + intgemm_type = 0x10000, // intgemm quantized architecture agnostic models + + size_mask = 0x000FF, // maximum allowed size is 256 bytes right now; if more are required, extend the size field + class_mask = 0xFFF00, // three fields for different type classes, if more classes are added we need to increase the number of fields here }; constexpr inline size_t operator+(TypeClass typeClass, size_t val) { @@ -251,10 +262,21 @@ enum class Type : size_t { float32 = TypeClass::float_type + 4u, float64 = TypeClass::float_type + 8u, - packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless. - packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. - packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. + packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. 
Internal actual type (uint16) is meaningless. + packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. + packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. + + intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm + intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm + intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm + intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm + intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm + intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm + + intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm + intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm + intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm }; static inline size_t operator&(TypeClass typeClass, Type type) { @@ -289,6 +311,14 @@ static inline bool isPacked(Type type) { return (TypeClass::packed_type & type) != 0; } +static inline bool isSse2(Type type) { + return (TypeClass::sse2_type & type) != 0; +} + +static inline bool isSsse3(Type type) { + return (TypeClass::ssse3_type & type) != 0; +} + static inline bool isAvx2(Type type) { return (TypeClass::avx2_type & type) != 0; } @@ -297,6 +327,10 @@ static inline bool isAvx512(Type type) { return (TypeClass::avx512_type & type) != 0; } +static inline bool isIntgemm(Type type) { + return (TypeClass::intgemm_type & type) != 0; +} + size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type template @@ -314,13 +348,24 @@ template <> inline bool matchType(Type type) { return type == Type::ui template <> inline bool matchType(Type type) { return type == Type::uint32; } template <> inline bool matchType(Type type) { return type == Type::uint64; } -template <> inline bool matchType(Type type) { return type == Type::float16; } -template <> inline bool matchType(Type type) { return type == Type::float32; } -template <> inline bool matchType(Type type) { return type == Type::float64; } +template <> inline bool matchType(Type type) { return type == Type::float16; } +template <> inline bool matchType(Type type) { return type == Type::float32; } +template <> inline bool matchType(Type type) { return type == Type::float64; } + +template <> inline bool matchType(Type type) { return type == Type::packed16; } +template <> inline bool matchType(Type type) { return type == Type::packed8avx2; } +template <> inline bool matchType(Type type) { return type == Type::packed8avx512; } -template <> inline bool matchType(Type type) { return type == Type::packed16; } -template <> inline bool matchType(Type type) { return type == Type::packed8avx2; } -template 
<> inline bool matchType(Type type) { return type == Type::packed8avx512; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8ssse3; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8avx2; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8avx512; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8avx512vnni; } + +template <> inline bool matchType(Type type) { return type == Type::intgemm16; } +template <> inline bool matchType(Type type) { return type == Type::intgemm16sse2; } +template <> inline bool matchType(Type type) { return type == Type::intgemm16avx2; } +template <> inline bool matchType(Type type) { return type == Type::intgemm16avx512; } // clang-format on static inline std::ostream& operator<<(std::ostream& out, Type type) { @@ -342,6 +387,16 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) { case Type::packed16 : out << "packed16"; break; case Type::packed8avx2 : out << "packed8avx2"; break; case Type::packed8avx512 : out << "packed8avx512"; break; + + case Type::intgemm8 : out << "intgemm8"; break; + case Type::intgemm8ssse3 : out << "intgemm8ssse3"; break; + case Type::intgemm8avx2 : out << "intgemm8avx2"; break; + case Type::intgemm8avx512 : out << "intgemm8avx512"; break; + case Type::intgemm8avx512vnni : out << "intgemm8avx512vnni"; break; + case Type::intgemm16 : out << "intgemm16"; break; + case Type::intgemm16sse2 : out << "intgemm16sse2"; break; + case Type::intgemm16avx2 : out << "intgemm16avx2"; break; + case Type::intgemm16avx512 : out << "intgemm16avx512"; break; } return out; } @@ -350,12 +405,12 @@ template inline std::string request(); // clang-format off -template <> inline std::string request() { return "int8"; } +template <> inline std::string request() { return "int8"; } template <> inline std::string request() { return "int16"; } template <> inline std::string request() { return "int32"; } template <> inline std::string request() { return "int64"; } -template <> inline std::string request() { return "uint8"; } +template <> inline std::string request() { return "uint8"; } template <> inline std::string request() { return "uint16"; } template <> inline std::string request() { return "uint32"; } template <> inline std::string request() { return "uint64"; } @@ -364,9 +419,19 @@ template <> inline std::string request() { return "float16"; } template <> inline std::string request() { return "float32"; } template <> inline std::string request() { return "float64"; } -template <> inline std::string request() { return "packed16"; } -template <> inline std::string request() { return "packed8avx2"; } -template <> inline std::string request() { return "packed8avx512"; } +template <> inline std::string request() { return "packed16"; } +template <> inline std::string request() { return "packed8avx2"; } +template <> inline std::string request() { return "packed8avx512"; } + +template <> inline std::string request() { return "intgemm8"; } +template <> inline std::string request() { return "intgemm8ssse3"; } +template <> inline std::string request() { return "intgemm8avx2"; } +template <> inline std::string request() { return "intgemm8avx512"; } +template <> inline std::string request() { return "intgemm8avx512vnni"; } +template <> inline std::string request() { return "intgemm16"; } +template <> inline std::string request() { return "intgemm16sse2"; } 
+template <> inline std::string request() { return "intgemm16avx2"; } +template <> inline std::string request() { return "intgemm16avx512"; } // clang-format on static Type inline typeFromString(const std::string& str) { @@ -402,18 +467,38 @@ static Type inline typeFromString(const std::string& str) { if(str == "packed8avx512") return Type::packed8avx512; + if(str == "intgemm8") + return Type::intgemm8; + if(str == "intgemm8ssse3") + return Type::intgemm8ssse3; + if(str == "intgemm8avx2") + return Type::intgemm8avx2; + if(str == "intgemm8avx512") + return Type::intgemm8avx512; + if(str == "intgemm8avx512vnni") + return Type::intgemm8avx512vnni; + + if(str == "intgemm16") + return Type::intgemm16; + if(str == "intgemm16sse2") + return Type::intgemm16sse2; + if(str == "intgemm16avx2") + return Type::intgemm16avx2; + if(str == "intgemm16avx512") + return Type::intgemm16avx512; + ABORT("Unknown type {}", str); } template inline Type typeId(); -template <> inline Type typeId() { return Type::int8; } +template <> inline Type typeId() { return Type::int8; } template <> inline Type typeId() { return Type::int16; } template <> inline Type typeId() { return Type::int32; } template <> inline Type typeId() { return Type::int64; } -template <> inline Type typeId() { return Type::uint8; } +template <> inline Type typeId() { return Type::uint8; } template <> inline Type typeId() { return Type::uint16; } template <> inline Type typeId() { return Type::uint32; } template <> inline Type typeId() { return Type::uint64; } @@ -422,10 +507,21 @@ template <> inline Type typeId() { return Type::float16; } template <> inline Type typeId() { return Type::float32; } template <> inline Type typeId() { return Type::float64; } -template <> inline Type typeId() { return Type::packed16; } -template <> inline Type typeId() { return Type::packed8avx2; } +template <> inline Type typeId() { return Type::packed16; } +template <> inline Type typeId() { return Type::packed8avx2; } template <> inline Type typeId() { return Type::packed8avx512; } +template <> inline Type typeId() { return Type::intgemm8; } +template <> inline Type typeId() { return Type::intgemm8ssse3; } +template <> inline Type typeId() { return Type::intgemm8avx2; } +template <> inline Type typeId() { return Type::intgemm8avx512; } +template <> inline Type typeId() { return Type::intgemm8avx512vnni; } +template <> inline Type typeId() { return Type::intgemm16; } +template <> inline Type typeId() { return Type::intgemm16sse2; } +template <> inline Type typeId() { return Type::intgemm16avx2; } +template <> inline Type typeId() { return Type::intgemm16avx512; } + + // Abort if given C++ does not correspond to runtime type template void matchOrAbort(Type type) { diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 78cbccea2..395bcfee7 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -260,6 +260,14 @@ class LexicalShortlistGenerator : public ShortlistGenerator { for(auto& it : data_[i]) indexSet.insert(it.first); } + // Ensure that the generated vocabulary items from a shortlist are a multiple-of-eight + // This is necessary until intgemm supports non-multiple-of-eight matrices. + // TODO better solution here? This could potentially be slow. 
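The padding loop added just below can be exercised on its own; a small self-contained sketch of the same idea, using a plain std::set and starting the filler ids at 0 where the generator would use its firstNum_ setting:

```cpp
#include <cassert>
#include <cstdint>
#include <set>

int main() {
  std::set<uint32_t> indexSet = {5, 17, 42, 100, 101};  // five shortlisted word ids
  // Pad with the lowest ids until the count is a multiple of eight, mirroring the loop
  // added below; inserting an id that is already present leaves the size unchanged,
  // so the loop simply advances to the next candidate.
  uint32_t i = 0;  // stands in for firstNum_ in the shortlist generator
  while (indexSet.size() % 8 != 0) {
    indexSet.insert(i);
    ++i;
  }
  assert(indexSet.size() % 8 == 0);  // five entries padded up to eight here
  return 0;
}
```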
+ WordIndex i = static_cast(firstNum_); + while (indexSet.size() % 8 != 0) { + indexSet.insert(i); + i++; + } // turn into vector and sort (selected indices) std::vector indices(indexSet.begin(), indexSet.end()); diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index 65807a51b..f2e4a10c0 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -84,11 +84,6 @@ class Embed : public ModelTask { auto precison = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph graph->setDevice(device); - graph->getBackend()->setClip(options_->get("clip-gemm")); - if (device.type == DeviceType::cpu) { - graph->getBackend()->setOptimized(options_->get("optimize")); - } - graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); } diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index f571f9f8b..3d42c6002 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -7,7 +7,7 @@ #include "graph/node_operators_tuple.h" #include "graph/auto_tuner.h" -#include "tensors/cpu/int16.h" +#include "tensors/cpu/intgemm_interface.h" #include "tensors/cpu/fbgemm/expanded_gemm.h" #if USE_FBGEMM @@ -466,7 +466,6 @@ Expr weighted_average(Expr in, Expr weights, int ax) { Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { auto device = a->graph()->getDeviceId().type; - float clipValue = a->graph()->getBackend()->getClip(); // added support for packed GEMM API (fp16, int8) Type aElementType = a->value_type(); Type bElementType = b->value_type(); @@ -475,18 +474,9 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { // --optimize --cpu-thread=N with N > 0 are set. if(device == DeviceType::cpu) { if(isFloat(aElementType) && isFloat(bElementType)) { - if(a->graph()->getBackend()->isOptimized()) { - // dotInt16 computes A * B.T, hence the transpose for B to get A * B - // if transA = false and transB = false. - - return cpu::int16::dot( - cpu::int16::quantize(transA ? transpose(a) : a, clipValue), - cpu::int16::quantize(transB ? b : transpose(b), clipValue), - scale); - } else { - return Expression( - clip(a, clipValue), clip(b, clipValue), transA, transB, scale); - } + return Expression(a, b, transA, transB, scale); + } else if(isFloat(aElementType) && isIntgemm(bElementType)) { + return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale); } else if(isFloat(aElementType) && isPacked(bElementType)) { #if USE_FBGEMM // 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2 @@ -496,7 +486,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { // and this cpu lookup is executed only once and the state is kept in FBGEMM. if(fbgemm::fbgemmHasAvx2Support()) { // This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B). 
- return cpu::variant::dot(clip(a, clipValue), + return cpu::variant::dot(a, b, b->shape(), transA, @@ -512,8 +502,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType); } } else { - return Expression( - clip(a, clipValue), clip(b, clipValue), transA, transB, scale); + return Expression(a, b, transA, transB, scale); } } @@ -524,16 +513,9 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) { static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // general version, MKL, CBlas or CUDA - // if clipValue > 0, the inputs will be clipped to range [-clipValue, - // clipValue] This is meant to keep values at the same range as used during - // training when optimizing for 8-bit integer products. Likely to be removed - // in the future when we explore better ways to handle this. - float clipValue = a->graph()->getBackend()->getClip(); - int rows = a->shape().elements() / a->shape()[-1]; Expr ones = a->graph()->ones({ rows, 1 }); - std::vector nodes - = { clip(a, clipValue), clip(b, clipValue), bias, ones }; + std::vector nodes = { a, b, bias, ones }; return Expression(nodes, transA, transB, scale); } @@ -545,22 +527,14 @@ static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, f Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { auto device = a->graph()->getDeviceId().type; - float clipValue = a->graph()->getBackend()->getClip(); Type aElementType = a->value_type(); Type bElementType = b->value_type(); if(device == DeviceType::cpu) { if(isFloat(aElementType) && isFloat(bElementType)) { - if(a->graph()->getBackend()->isOptimized()) { - // cpu int16 version - return cpu::int16::affine( - cpu::int16::quantize(transA ? transpose(a) : a, clipValue), - cpu::int16::quantize(transB ? b : transpose(b), clipValue), - bias, - scale); - } else { - return affineDefault(a, b, bias, transA, transB, scale); - } + return affineDefault(a, b, bias, transA, transB, scale); + } else if(isFloat(aElementType) && isIntgemm(bElementType)) { + return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale); } else if(isFloat(aElementType) && isPacked(bElementType)) { #if USE_FBGEMM // 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2 @@ -570,7 +544,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // and this cpu lookup is executed only once and the state is kept in FBGEMM. if(fbgemm::fbgemmHasAvx2Support()) { // This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B). 
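For context on the `cpu::integer::affineOrDot` calls introduced in the two hunks above: the wrapper lives in the new tensors/cpu/intgemm_interface.h (not shown in this excerpt) and ultimately follows intgemm's usual prepare/multiply flow, where B is quantized and reordered ahead of time, A is quantized on the fly, and the integer product is unquantized back to float. Below is a rough standalone sketch against intgemm's public API; the include path, the 64-byte alignment requirement and the dimension restrictions are assumptions based on intgemm's documentation rather than on this patch:

```cpp
// Rough, POSIX-only sketch (posix_memalign) of intgemm's prepare/multiply flow.
#include <cstdint>
#include <cstdlib>
#include "intgemm/intgemm.h"  // shipped as src/3rd_party/intgemm in this repo; adjust as needed

template <class T> T* alignedAlloc(std::size_t n) {  // intgemm expects 64-byte aligned buffers
  void* p = nullptr;
  if (posix_memalign(&p, 64, n * sizeof(T))) std::abort();
  return static_cast<T*>(p);
}

int main() {
  // Illustrative sizes: the shared dimension and B's column count have to respect
  // intgemm's tiling restrictions (multiples of 64 and 8 are safe for all kernels),
  // which is also why the shortlist is padded to a multiple of eight elsewhere in this patch.
  const intgemm::Index A_rows = 8, width = 64, B_cols = 8;

  float* A = alignedAlloc<float>(A_rows * width);
  float* B = alignedAlloc<float>(width * B_cols);
  float* C = alignedAlloc<float>(A_rows * B_cols);
  for (std::size_t i = 0; i < A_rows * width; ++i) A[i] = 0.25f;
  for (std::size_t i = 0; i < width * B_cols; ++i) B[i] = -0.5f;

  // Illustrative multipliers (127 over an assumed maximum absolute value of 1.0);
  // marian computes B's multiplier offline and stores it alongside the weights.
  const float quantMultA = 127.0f, quantMultB = 127.0f;

  int8_t* A8 = alignedAlloc<int8_t>(A_rows * width);
  int8_t* B8 = alignedAlloc<int8_t>(width * B_cols);
  intgemm::Int8::PrepareA(A, A8, quantMultA, A_rows, width);
  intgemm::Int8::PrepareB(B, B8, quantMultB, width, B_cols);
  intgemm::Int8::Multiply(A8, B8, A_rows, width, B_cols,
      intgemm::callbacks::UnquantizeAndWrite(1.0f / (quantMultA * quantMultB), C));

  std::free(A); std::free(B); std::free(C); std::free(A8); std::free(B8);
  return 0;
}
```

At inference time marian does not recompute B's multiplier; it reads the extra float stored after the tensor data (see the requiredBytes() change above and getQuantMult() in integer_common.h below).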
- return cpu::variant::affine(clip(a, clipValue), + return cpu::variant::affine(a, b, b->shape(), bias, diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 827c89828..4ecdf1697 100755 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -10,7 +10,7 @@ #include "translator/scorers.h" #include "data/alignment.h" #include "data/vocab_base.h" -#include "tensors/cpu/fbgemm/expression_graph_packable.h" +#include "tensors/cpu/expression_graph_packable.h" #if USE_FBGEMM #include "fbgemm/Utils.h" @@ -256,7 +256,6 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP auto graph = New(); graph->setDevice(CPU0); - graph->getBackend()->setOptimized(false); graph->load(inputFile); graph->forward(); diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index 65644ccce..af0a16066 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -73,11 +73,6 @@ class Rescore : public ModelTask { auto precison = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph graph->setDevice(device); - graph->getBackend()->setClip(options_->get("clip-gemm")); - if (device.type == DeviceType::cpu) { - graph->getBackend()->setOptimized(options_->get("optimize")); - } - graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); } diff --git a/src/tensors/backend.h b/src/tensors/backend.h index ce8a5e603..160b828d3 100644 --- a/src/tensors/backend.h +++ b/src/tensors/backend.h @@ -10,10 +10,7 @@ class Backend { DeviceId deviceId_; size_t seed_; Ptr randomGenerator_; - - // global clipping value for matrix-multiplies, should soon be removed. - float clipValue_{0.f}; - + public: Backend(DeviceId deviceId, size_t seed) : deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {} @@ -24,14 +21,6 @@ class Backend { // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name. virtual void setDevice() = 0; virtual void synchronize() = 0; - - virtual void setClip(float clipValue) { clipValue_ = clipValue; } - float getClip() { return clipValue_; } - - // for CPU, sets to use optimized code for inference. - // for GPU, this is invalid. for gpu, isOptimized() function always returns false. - virtual void setOptimized(bool optimize) = 0; - virtual bool isOptimized() = 0; }; Ptr BackendByDeviceId(DeviceId deviceId, size_t seed); diff --git a/src/tensors/cpu/aligned.h b/src/tensors/cpu/aligned.h new file mode 100644 index 000000000..969e08ae9 --- /dev/null +++ b/src/tensors/cpu/aligned.h @@ -0,0 +1,43 @@ +#pragma once + +#include "common/definitions.h" +#include +#ifdef _WIN32 +#include +#endif + +namespace marian { +namespace cpu { +namespace { + +// allocate function for tensor reserve() below. +// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory. + +#ifdef _WIN32 +void *genericMalloc(size_t alignment, size_t size) { + void *ret = _aligned_malloc(size, alignment); + ABORT_IF(!ret, "Failed to allocate memory on CPU"); + return ret; +} +void genericFree(void *ptr) { + _aligned_free(ptr); +} +#else +// Linux and OS X. There is no fallback to malloc because we need it to be aligned. +void *genericMalloc(size_t alignment, size_t size) { + // On macos, aligned_alloc is available only on c++17 + // Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails. 
+ // posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang + void *result; + // Error could be detected by return value or just remaining nullptr. + ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU"); + return result; +} +void genericFree(void *ptr) { + free(ptr); +} +#endif + +} +} // namespace cpu +} // namespace marian diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 74bbf8082..398e24240 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -10,17 +10,11 @@ namespace marian { namespace cpu { class Backend : public marian::Backend { -protected: - bool optimized_{false}; - public: Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {} void setDevice() override {} void synchronize() override {} - - // for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU. - void setOptimized(bool optimize) override { optimized_ = optimize; } - bool isOptimized() override { return optimized_; } }; + } // namespace cpu } // namespace marian diff --git a/src/tensors/cpu/device.cpp b/src/tensors/cpu/device.cpp index 40bb558b1..fcc63b6a5 100644 --- a/src/tensors/cpu/device.cpp +++ b/src/tensors/cpu/device.cpp @@ -1,44 +1,8 @@ #include "tensors/device.h" +#include "tensors/cpu/aligned.h" #include - -#ifdef _WIN32 -#include -#endif -#include - namespace marian { namespace cpu { -namespace { - -// allocate function for tensor reserve() below. -// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory. - -#ifdef _WIN32 -void *genericMalloc(size_t alignment, size_t size) { - void *ret = _aligned_malloc(size, alignment); - ABORT_IF(!ret, "Failed to allocate memory on CPU"); - return ret; -} -void genericFree(void *ptr) { - _aligned_free(ptr); -} -#else -// Linux and OS X. There is no fallback to malloc because we need it to be aligned. -void *genericMalloc(size_t alignment, size_t size) { - // On macos, aligned_alloc is available only on c++17 - // Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails. - // posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang - void *result; - // Error could be detected by return value or just remaining nullptr. - ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU"); - return result; -} -void genericFree(void *ptr) { - free(ptr); -} -#endif - -} // namespace Device::~Device() { genericFree(data_); diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h new file mode 100644 index 000000000..689aa3b18 --- /dev/null +++ b/src/tensors/cpu/expression_graph_packable.h @@ -0,0 +1,266 @@ +#pragma once + +#include "graph/expression_graph.h" +#include "fbgemm/packed_gemm.h" +#include "tensors/cpu/integer_common.h" + +namespace marian { + namespace cpu { + void Transpose10(marian::Tensor out, const marian::Tensor in); + } +} + +namespace marian { + + +// When FBGEMM based packed GEMM is used, some weight matrices need to be packed offline. +// The decision which weights can be packed or not should be done walking through the graph. +// This requires some more changes, but we temporarily do this just by name ("_W") of the weights. +// And, this introduces a low level packed_gemm.h apis interact with high level graph class. 
+// So, we make a subclass of ExpressionGraph and put those immature codes in this class. +// We will improve this in the near future. +class ExpressionGraphPackable : public ExpressionGraph { +public: + ExpressionGraphPackable() + : ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference + + virtual ~ExpressionGraphPackable() {} + + // Convert model weights into packed format and save to IO items. + // @TODO: review this + void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) { + std::vector ioItems; + + // sorted by name in std::map + for (auto p : params()->getMap()) { + std::string pName = p.first; + + if (!namespace_.empty()) { + if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::") + pName = pName.substr(namespace_.size() + 2); + } + + Tensor val = p.second->val(); + + // save as packed format + // @TODO Hardcoded to find packable weights + // int8 - all the weights used for affine op and dot op + // fp16 - all the weights used for affine op + if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512) + && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) { +#if USE_FBGEMM + using namespace marian::cpu::variant; + // packing information - size + int nrow; + int ncol; + uint64_t packsize; + + fbgemmPacked8PackInfo(val->shape(), + gemmElementType, + pName.find("Wemb") != std::string::npos, + nrow, + ncol, + packsize); + + auto allocator = New(getBackend()); + + // buffer tensor to save packed matrix + Tensor packedTensor; + allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); + + //Pack B matrix into int8 + fbgemmPacked8Pack(packedTensor, + val->data(), + gemmElementType, + pName.find("Wemb") != std::string::npos, + nrow, + ncol, + packsize); + io::Item item; + item.name = pName; + item.shape = val->shape(); + item.type = gemmElementType; + + // Use the actual memory as this will be aligned and padded. + // When memory mapping this is required. Shape keeps track of + // tensor size. Saving to *.npz will cut to size. + auto mem = packedTensor->memory(); + item.bytes.resize(mem->size()); + copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); + + ioItems.emplace_back(std::move(item)); +#else + ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); +#endif + // fp16 quantization option + } else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) { +#if USE_FBGEMM + using namespace marian::cpu::variant; + + // packing information + int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol; + uint64_t packsize; + + fbgemmPacked16PackInfo(val->shape(), + false, + nrow, + ncol, + kernel_ncol_blocks, + brow, + bcol, + last_brow, + nbrow, + nbcol, + packsize); + + auto allocator = New(getBackend()); + + Tensor packedTensor; + allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); + + // fbgemmPacked16Pack + fbgemmPacked16Pack(packedTensor, + val->data(), + false, + nrow, + ncol, + kernel_ncol_blocks, + brow, + bcol, + last_brow, + nbrow, + nbcol, + packsize); + io::Item item; + item.name = pName; + item.shape = val->shape(); + item.type = Type::packed16; + + // Use the actual memory as this will be aligned and padded. + // When memory mapping this is required. Shape keeps track of + // tensor size. Saving to *.npz will cut to size. 
+ auto mem = packedTensor->memory(); + item.bytes.resize(mem->size()); + copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); + + ioItems.emplace_back(std::move(item)); +#else + ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); +#endif + } else if (isIntgemm(gemmElementType) && + (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) { +#if COMPILE_CPU + using cpu::integer::cols; + using cpu::integer::rows; + auto allocator = New(getBackend()); + + Tensor paramMat; //This allocates extra 4 bytes at the end because of gemmElementType + allocator->allocate(paramMat, val->shape(), gemmElementType); + + // Compute QuantMultiplier, compress matrix and store quantMult at the end. + // We need to tranpose first, because of our architecture independet format requiring a transposed matrix + Tensor tmp; + allocator->allocate(tmp, val->shape(), val->type()); + cpu::Transpose10(tmp, val); + + if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type + float quantMult = cpu::integer::computeQuantMult(val); + + // Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime + cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type + if(isSsse3(gemmElementType)) { + intgemm::ssse3::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx2(gemmElementType)) { + intgemm::avx2::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx512(gemmElementType)) { + intgemm::avx512bw::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else { + ABORT_IF(gemmElementType != Type::intgemm8, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure + intgemm::Int8::PrepareA(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } + //Put the quantMult at the back of the tensor + cpu::integer::getQuantMult(paramMat) = quantMult; + + } else if(sizeOf(gemmElementType) == 2) { // is 16-bit Intgemm type + float quantMult = cpu::integer::computeQuantMult(val); + + // Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime + cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type + if(isSse2(gemmElementType)) { + intgemm::sse2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx2(gemmElementType)) { + intgemm::avx2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx512(gemmElementType)) { + intgemm::avx512bw::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else { + ABORT_IF(gemmElementType != Type::intgemm16, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure + intgemm::Int16::PrepareA(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } + //Put the 
quantMult at the back of the tensor + cpu::integer::getQuantMult(paramMat) = quantMult; + + } else { + ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType)); + } + + //Save... Same as the fbgemm case + io::Item item; + item.name = pName; + item.shape = val->shape(); + item.type = gemmElementType; + + auto mem = paramMat->memory(); + item.bytes.resize(mem->size()); + copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); + ioItems.emplace_back(std::move(item)); +#else + ABORT("Packed type {} only supported when compiled with -DCOMPILE_CPU=on", gemmElementType); +#endif + } else { + ABORT_IF(saveElementType != Type::float32, "We currently do not know how to save matrices as {}", saveElementType); + io::Item item; + val->get(item, pName); + item.convert(saveElementType); + ioItems.emplace_back(std::move(item)); + } + } + + if (!meta.empty()) + io::addMetaToItems(meta, "special:model.yml", ioItems); + io::saveItems(name, ioItems); + } +}; + +} // namespace marian diff --git a/src/tensors/cpu/fbgemm/expanded_gemm.h b/src/tensors/cpu/fbgemm/expanded_gemm.h index a5c93f6bc..fb07bbad5 100644 --- a/src/tensors/cpu/fbgemm/expanded_gemm.h +++ b/src/tensors/cpu/fbgemm/expanded_gemm.h @@ -2,7 +2,7 @@ #include "graph/node.h" #include "packed_gemm.h" -#include "tensors/cpu/sharp/int_gemm.h" +#include "tensors/cpu/integer_common.h" #if USE_FBGEMM #ifdef __GNUC__ @@ -57,14 +57,12 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp { int nbcol_; uint64_t packsize_; - FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue) + FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose) : UnaryNodeOp(a, newShape(a, transpose), Type::uint8), packMat_(packMat), transpose_(transpose) { if(packMat != PackMatrix::B) ABORT("Only prepacking of B (weight matrix) is supported"); - if(clipValue != 0) - ABORT("Clipping is not supported"); if(!memoize_) ABORT("Only constant weight node can be packed"); } @@ -144,16 +142,13 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { FbgemmPacked8PackNodeOp(Expr a, PackMatrix packMat, marian::Type packType, - bool transpose, - float clipValue) + bool transpose) : UnaryNodeOp(a, newShape(a, transpose), Type::uint8), packMat_(packMat), packType_(packType), transpose_(transpose) { if(packMat != PackMatrix::B) ABORT("Only prepacking of B (weight matrix) is supported"); - if(clipValue != 0) - ABORT("Clipping is not supported"); if(!memoize_) ABORT("Only constant weight node can be packed"); } @@ -337,7 +332,7 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp { k_, transA_, transB_); - marian::cpu::int16::AddBias(val_, child(2)->val())) }; + marian::cpu::integer::AddBias(val_, child(2)->val())) }; } else { nodeOps = { NodeOp(fbgemmPacked8Gemm(val_, child(0)->val(), @@ -377,11 +372,11 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo } } -static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) { +static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) { if (elementType == Type::packed16) - return Expression(a, packMat, transpose, clipValue); + return Expression(a, packMat, transpose); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(a, packMat, elementType, transpose, clipValue); + return Expression(a, packMat, elementType, transpose); else { ABORT("Only int8 and fp16 are available. 
{}", elementType); return nullptr; diff --git a/src/tensors/cpu/fbgemm/expression_graph_packable.h b/src/tensors/cpu/fbgemm/expression_graph_packable.h deleted file mode 100644 index f5b05c302..000000000 --- a/src/tensors/cpu/fbgemm/expression_graph_packable.h +++ /dev/null @@ -1,156 +0,0 @@ -#pragma once - -#include "graph/expression_graph.h" -#include "packed_gemm.h" - -namespace marian { - -// When FBGEMM based packed GEMM is used, some weight matrices need to be packed offline. -// The decision which weights can be packed or not should be done walking through the graph. -// This requires some more changes, but we temporarily do this just by name ("_W") of the weights. -// And, this introduces a low level packed_gemm.h apis interact with high level graph class. -// So, we make a subclass of ExpressionGraph and put those immature codes in this class. -// We will improve this in the near future. -class ExpressionGraphPackable : public ExpressionGraph { -public: - ExpressionGraphPackable() - : ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference - - virtual ~ExpressionGraphPackable() {} - - // Convert model weights into packed format and save to IO items. - // @TODO: review this - void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) { - std::vector ioItems; - - // sorted by name in std::map - for (auto p : params()->getMap()) { - std::string pName = p.first; - - if (!namespace_.empty()) { - if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::") - pName = pName.substr(namespace_.size() + 2); - } - - Tensor val = p.second->val(); - - // save as packed format - // @TODO Hardcoded to find packable weights - // int8 - all the weights used for affine op and dot op - // fp16 - all the weights used for affine op - if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512) - && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) { -#if USE_FBGEMM - using namespace marian::cpu::variant; - // packing information - size - int nrow; - int ncol; - uint64_t packsize; - - fbgemmPacked8PackInfo(val->shape(), - gemmElementType, - pName.find("Wemb") != std::string::npos, - nrow, - ncol, - packsize); - - auto allocator = New(getBackend()); - - // buffer tensor to save packed matrix - Tensor packedTensor; - allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); - - //Pack B matrix into int8 - fbgemmPacked8Pack(packedTensor, - val->data(), - gemmElementType, - pName.find("Wemb") != std::string::npos, - nrow, - ncol, - packsize); - io::Item item; - item.name = pName; - item.shape = val->shape(); - item.type = gemmElementType; - - // Use the actual memory as this will be aligned and padded. - // When memory mapping this is required. Shape keeps track of - // tensor size. Saving to *.npz will cut to size. 
- auto mem = packedTensor->memory(); - item.bytes.resize(mem->size()); - copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); - - ioItems.emplace_back(std::move(item)); -#else - ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); -#endif - // fp16 quantization option - } else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) { -#if USE_FBGEMM - using namespace marian::cpu::variant; - - // packing information - int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol; - uint64_t packsize; - - fbgemmPacked16PackInfo(val->shape(), - false, - nrow, - ncol, - kernel_ncol_blocks, - brow, - bcol, - last_brow, - nbrow, - nbcol, - packsize); - - auto allocator = New(getBackend()); - - Tensor packedTensor; - allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); - - // fbgemmPacked16Pack - fbgemmPacked16Pack(packedTensor, - val->data(), - false, - nrow, - ncol, - kernel_ncol_blocks, - brow, - bcol, - last_brow, - nbrow, - nbcol, - packsize); - io::Item item; - item.name = pName; - item.shape = val->shape(); - item.type = Type::packed16; - - // Use the actual memory as this will be aligned and padded. - // When memory mapping this is required. Shape keeps track of - // tensor size. Saving to *.npz will cut to size. - auto mem = packedTensor->memory(); - item.bytes.resize(mem->size()); - copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); - - ioItems.emplace_back(std::move(item)); -#else - ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); -#endif - } else { - io::Item item; - val->get(item, pName); - item.convert(saveElementType); - ioItems.emplace_back(std::move(item)); - } - } - - if (!meta.empty()) - io::addMetaToItems(meta, "special:model.yml", ioItems); - io::saveItems(name, ioItems); - } -}; - -} // namespace marian \ No newline at end of file diff --git a/src/tensors/cpu/int16.h b/src/tensors/cpu/int16.h deleted file mode 100644 index f2bdd0a91..000000000 --- a/src/tensors/cpu/int16.h +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once - -#include "graph/node.h" -#include "tensors/cpu/sharp/int_gemm.h" - -namespace marian { -namespace cpu { -namespace int16 { - -struct QuantizeNodeOp : public UnaryNodeOp { - float clipValue_; - - QuantizeNodeOp(Expr a, float clipValue) - : UnaryNodeOp(a, Type::int16), clipValue_{clipValue} {} - - NodeOps forwardOps() override { - return {NodeOp(Quantize16(val_, child(0)->val(), clipValue_))}; - } - - NodeOps backwardOps() override { - ABORT("Only used for inference"); - } - - const std::string type() override { return "quantizeInt16"; } -}; - -class DotNodeOp : public NaryNodeOp { -private: - float scalar_; - -public: - DotNodeOp(Expr a, Expr b, float scalar) - : NaryNodeOp({a, b}, newShape(a, b), Type::float32), scalar_(scalar) {} - - Shape newShape(Expr a, Expr b) { - auto shapeA = a->shape(); - auto shapeB = b->shape(); - - // Computing A * B^T - shapeB.set(-2, b->shape()[-1]); - shapeB.set(-1, b->shape()[-2]); - - Shape outShape = shapeA; - outShape.set(-1, shapeB[-1]); - ABORT_IF(shapeA[-1] != shapeB[-2], - "matrix product requires dimensions to match"); - return outShape; - } - - NodeOps forwardOps() override { - return {NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_))}; - } - - NodeOps backwardOps() override { - ABORT("Only used for inference"); - } - - const std::string type() override { return "dotInt16"; } -}; - -class AffineNodeOp : public NaryNodeOp 
{ -private: - float scalar_; - -public: - AffineNodeOp(const std::vector& nodes, float scalar) - : NaryNodeOp(nodes, newShape(nodes[0], nodes[1]), Type::float32), scalar_(scalar) {} - - Shape newShape(Expr a, Expr b) { - auto shapeA = a->shape(); - auto shapeB = b->shape(); - - // Computing A * B^T - shapeB.set(-2, b->shape()[-1]); - shapeB.set(-1, b->shape()[-2]); - - Shape outShape = shapeA; - outShape.set(-1, shapeB[-1]); - ABORT_IF(shapeA[-1] != shapeB[-2], - "matrix product requires dimensions to match"); - return outShape; - } - - NodeOps forwardOps() override { - return { - NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_); - AddBias(val_, child(2)->val())) - }; - } - - NodeOps backwardOps() override { - ABORT("Only used for inference"); - } - - const std::string type() override { return "affineInt16"; } -}; - -static inline Expr dot(Expr a, Expr b, float scalar) { - return Expression(a, b, scalar); -} - -static inline Expr affine(Expr a, Expr b, Expr c, float scalar) { - std::vector nodes = {a, b, c}; - return Expression(nodes, scalar); -} - -static inline Expr quantize(Expr a, float clipValue) { - return Expression(a, clipValue); -} - -} // namespace int16 -} // namespace cpu -} // namespace marian diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp new file mode 100644 index 000000000..6864a86d6 --- /dev/null +++ b/src/tensors/cpu/integer_common.cpp @@ -0,0 +1,45 @@ +#include "integer_common.h" + +namespace marian { +namespace cpu { +namespace integer { +// This operates on floats after processing so doesn't care about int8_t vs int16_t. +void AddBias(marian::Tensor C, const marian::Tensor Bias) { + float* y = C->data(); + const float* x = C->data(); + const float* bias = Bias->data(); + + const int m = C->shape().elements() / C->shape()[-1]; + const int n = C->shape()[-1]; + + for(int j = 0; j < m; ++j) { + int i = 0; +#ifdef __AVX512F__ + int n16 = n & ~15; + for(; i < n16; i += 16) { + __m512 ai = _mm512_loadu_ps(x + j * n + i); + __m512 bi = _mm512_loadu_ps(bias + i); + __m512 yi = _mm512_add_ps(ai, bi); + _mm512_storeu_ps(y + j * n + i, yi); + } +#else + int n4 = (n / 4) * 4; + for(; i < n4; i += 4) { + __m128 ai = _mm_loadu_ps(x + j * n + i); + __m128 bi = _mm_loadu_ps(bias + i); + __m128 yi = _mm_add_ps(ai, bi); + _mm_storeu_ps(y + j * n + i, yi); + } +#endif + for(; i < n; i++) { + y[j * n + i] = x[j * n + i] + bias[i]; + } + } +} + +//template void prepareAndTranspose;//(io::Item& item, const char * input); +//template void prepareAndTranspose(io::Item&, const char *); + +} //integer +} //cpu +} //marian \ No newline at end of file diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h new file mode 100644 index 000000000..6e96f8cdc --- /dev/null +++ b/src/tensors/cpu/integer_common.h @@ -0,0 +1,223 @@ +#pragma once + +#include "tensors/tensor_allocator.h" +#include "tensors/tensor_operators.h" +#include "tensors/cpu/aligned.h" +#include "common/io_item.h" + +#if COMPILE_CPU +#include "3rd_party/intgemm/intgemm/intgemm.h" +#else +namespace intgemm { + struct Int8; + struct Int16; + namespace ssse3 { + struct Kernels8; + } + namespace sse2 { + struct Kernels16; + } + namespace avx2 { + struct Kernels8; + struct Kernels16; + } + namespace avx512bw { + struct Kernels8; + struct Kernels16; + } + namespace avx512vnni { + struct Kernels8; + } +} +#endif + +#include +#include +#include +#include +#include +#include + +namespace marian { +namespace cpu { +namespace integer { + +//Convenient function to 
get rows and columns of a tensor, shadowed by namespace.
+inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
+inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
+
+inline int cols(Shape& shape) { return shape[-1]; }
+inline int rows(Shape& shape) { return shape.elements() / cols(shape); }
+
+template <Type vtype> struct intgemm_;
+
+template <> struct intgemm_<Type::intgemm8> {
+  using width = intgemm::Int8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8ssse3> {
+  using width = intgemm::ssse3::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8avx2> {
+  using width = intgemm::avx2::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8avx512> {
+  using width = intgemm::avx512bw::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8avx512vnni> {
+  using width = intgemm::avx512vnni::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm16> {
+  using width = intgemm::Int16;
+  using type = int16_t;
+};
+
+template <> struct intgemm_<Type::intgemm16sse2> {
+  using width = intgemm::sse2::Kernels16;
+  using type = int16_t;
+};
+
+template <> struct intgemm_<Type::intgemm16avx2> {
+  using width = intgemm::avx2::Kernels16;
+  using type = int16_t;
+};
+
+template <> struct intgemm_<Type::intgemm16avx512> {
+  using width = intgemm::avx512bw::Kernels16;
+  using type = int16_t;
+};
+
+template <Type vtype>
+static inline float& getQuantMult(marian::Tensor val) {
+#if COMPILE_CPU
+  ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type());
+  typedef typename intgemm_<vtype>::type Integer;
+  return *(reinterpret_cast<float*>(val->data<Integer>() + val->shape().elements()));
+#else
+  val;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+#endif
+}
+
+static inline Type getIntgemmType(Type vtype) {
+#if COMPILE_CPU
+  if (vtype == Type::intgemm8) {
+    if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) {
+      return Type::intgemm8avx512vnni;
+    } else if (intgemm::kCPU == intgemm::CPUType::AVX512BW) {
+      return Type::intgemm8avx512;
+    } else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
+      return Type::intgemm8avx2;
+    } else if (intgemm::kCPU == intgemm::CPUType::SSSE3) {
+      return Type::intgemm8ssse3;
+    } else {
+      ABORT("Your CPU doesn't support SSSE3, necessary for 8bit intgemm to work.");
+    }
+  } else if (vtype == Type::intgemm16) {
+    if (intgemm::kCPU > intgemm::CPUType::AVX2) {
+      return Type::intgemm16avx512;
+    } else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
+      return Type::intgemm16avx2;
+    } else if (intgemm::kCPU >= intgemm::CPUType::SSE2) {
+      return Type::intgemm16sse2;
+    } else {
+      ABORT("Your CPU doesn't support SSE2, necessary for 16bit intgemm to work.");
+    }
+  } else {
+    ABORT("Unrecognised type {}.", vtype);
+  }
+#else
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+  return vtype;
+#endif
+}
+
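To make the role of getIntgemmType() above concrete, here is a minimal illustrative sketch. It is not part of this patch, and the helper name resolveGemmType is invented for the example; it only shows how a loader could narrow the hardware-agnostic intgemm8/intgemm16 tags to the CPU-specific variants the rest of this header dispatches on.

#include "tensors/cpu/integer_common.h"

// Hypothetical helper: map a type tag read from a binary model to the type that
// will actually be executed on this machine. Hardware-agnostic tags are narrowed
// with getIntgemmType(); hardware-specific tags are kept as they are.
static marian::Type resolveGemmType(marian::Type storedType) {
  using marian::Type;
  if(storedType == Type::intgemm8 || storedType == Type::intgemm16)
    return marian::cpu::integer::getIntgemmType(storedType); // e.g. intgemm8 -> intgemm8avx2 on an AVX2-only CPU
  return storedType; // already hardware-specific
}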
+static inline bool passOrAbort(Type vtype) {
+#if COMPILE_CPU
+  if (vtype == Type::intgemm8 || vtype == Type::intgemm16) {
+    return true;
+  } else if (vtype == Type::intgemm16sse2) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSE2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8ssse3) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSSE3, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8avx2 || vtype == Type::intgemm16avx2) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8avx512 || vtype == Type::intgemm16avx512) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512BW, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8avx512vnni) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512VNNI, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  }
+  return true;
+#else
+  vtype;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+  return false;
+#endif
+}
+
+template <Type vtype>
+static inline float computeQuantMult(marian::Tensor val) {
+#if COMPILE_CPU
+  if(sizeOf(vtype) == 1)
+    return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements());
+  else if(sizeOf(vtype) == 2)
+    return 1024.0f;
+  else
+    ABORT("Unhandled type size {}", sizeOf(vtype));
+#else
+  val;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+#endif
+}
+
+// This operates on floats after processing so doesn't care about int8_t vs int16_t.
+void AddBias(marian::Tensor C, const marian::Tensor Bias);
+
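As an aside (not part of this patch), the helpers above and prepareAndTransposeB() below assume a simple memory layout for an intgemm parameter: the quantized, already-transposed values are followed by one float holding the quantization multiplier. A small sketch of that size arithmetic, with the helper name intgemmParamBytes invented for the example:

#include <cstddef>
#include <cstdint>

// Bytes occupied by an intgemm parameter of rows x cols elements:
// the quantized values themselves plus one trailing float for the quantization
// multiplier, which getQuantMult() reads back at data() + shape().elements().
template <class Integer>
constexpr std::size_t intgemmParamBytes(std::size_t rows, std::size_t cols) {
  return rows * cols * sizeof(Integer) + sizeof(float);
}

static_assert(intgemmParamBytes<int8_t>(512, 512)  == 512 * 512 + 4,     "8-bit layout");
static_assert(intgemmParamBytes<int16_t>(512, 512) == 512 * 512 * 2 + 4, "16-bit layout");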
+// For loading architecture-agnostic models. We only need PrepareBQuantizedTransposed here, because B was
+// already transposed when the binary was constructed. Afterwards we copy over the quantization multiplier
+// that is stored at the end of the tensor.
+template <Type vtype>
+void prepareAndTransposeB(io::Item& item, const char * input) {
+#if COMPILE_CPU
+  typedef typename intgemm_<vtype>::type Integer;
+  Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
+  // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
+  // If this is the case, we need to temporarily allocate aligned memory, copy the results, and then free it.
+  if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
+                                   output_tensor,
+                                   rows(item.shape),  // Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); // rows() here returns the columns of the transposed input matrix, and cols() the rows.
+  } else {
+    Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+    std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
+    Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
+                                   reinterpret_cast<Integer *>(aligned_output),
+                                   rows(item.shape),  // Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); // rows() here returns the columns of the transposed input matrix, and cols() the rows.
+    // Copy to output tensor
+    std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
+    genericFree(aligned_input);
+    genericFree(aligned_output);
+  }
+  // Copy the quantMult
+  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
+  *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
+#else
+  item, input;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+#endif
+}
+
+} //integer
+} //cpu
+} //marian
\ No newline at end of file
diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h
new file mode 100644
index 000000000..88408aa18
--- /dev/null
+++ b/src/tensors/cpu/intgemm_interface.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include "graph/node.h"
+#include "graph/node_operators_unary.h"
+#include "integer_common.h"
+
+namespace marian {
+
+namespace cpu {
+namespace integer {
+
+#if COMPILE_CPU
+/*
+ * Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
+ * Expr a: The input tensor
+ */
+template <Type vtype>
+static inline Expr prepareA(Expr a) {
+  auto nodeOp = [](Expr out, const std::vector<Expr>& children) {
+    Expr in = children[0];
+    auto quantMult = computeQuantMult<vtype>(in->val());
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::PrepareA(in->val()->data(), /*input*/
+                                     out->val()->data<Integer>(), /*output*/
+                                     quantMult, /*Quant Mult*/
+                                     rows(in->val()),
+                                     cols(in->val()));
+    getQuantMult<vtype>(out->val()) = quantMult;
+  };
+
+  return lambda({a}, a->shape(), vtype, nodeOp);
+}
+#endif
+
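Before the actual multiply wrapper below, a self-contained sketch (not part of this patch) of the arithmetic it relies on: values are scaled into integers with a quantization multiplier (127/max|x| for 8-bit, as in computeQuantMult() above), products are accumulated as integers, and the result is scaled back with 1/(quantMultA*quantMultB), which is the unquant_mult passed to the intgemm callbacks below. The maxima used here are made up for the example.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float a = 0.75f, b = -0.5f;          // pretend these are single matrix entries
  const float quantMultA = 127.0f / 1.0f;    // assuming max|A| == 1.0
  const float quantMultB = 127.0f / 2.0f;    // assuming max|B| == 2.0

  const int8_t qa = static_cast<int8_t>(std::lround(a * quantMultA)); // quantize A entry
  const int8_t qb = static_cast<int8_t>(std::lround(b * quantMultB)); // quantize B entry

  const int32_t accum = static_cast<int32_t>(qa) * static_cast<int32_t>(qb); // integer product
  const float unquantMult = 1.0f / (quantMultA * quantMultB);                // undo both scalings
  std::printf("float product %f, reconstructed %f\n", a * b, accum * unquantMult);
  return 0;
}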
+/*
+ * This computes A*B (+ bias if available) in intgemm.
+ * Expr a: The activation matrix in intgemm format
+ * Expr b: The parameter matrix in intgemm format
+ * Expr bias: The bias
+ * bool transA - transpose input A if true
+ * bool transB - unused here (@TODO remove?)
+ * float scale - scale the output by `scale`
+ * The template argument controls whether we're doing 16bit integers or 8bit integers.
+ * It can be Type::intgemm8 or Type::intgemm16 and all their hardware-specific variants.
+ */
+template <Type vtype>
+static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
+#if COMPILE_CPU
+  ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
+  ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());
+
+  auto aQuant = prepareA<vtype>(transA ? transpose(a) : a); // A should not be quantized yet as seen above, hence quantize here
+
+  // determine the output shape m x n for A: m x k and B: k x n
+  // since we transpose A beforehand we don't need to take care of transposed shapes here
+  Shape outShape = aQuant->shape();
+  outShape.set(-1, bQuant->shape()[-1]);
+
+  // wrap the multiply functions to be executed in the forward step of a Lambda node
+  auto dotOrAffineNodeOp = [=](Expr out, const std::vector<Expr>& children) {
+    Expr aQuant = children[0];
+    Expr bQuant = children[1];
+    Expr bias   = children.size() > 2 ? children[2] : nullptr;
+
+    // when we arrive here, A and B are already quantized, so just get the multipliers
+    float aQuantMult = getQuantMult<vtype>(aQuant->val());
+    float bQuantMult = getQuantMult<vtype>(bQuant->val());
+
+    float unquant_mult = 1.0f / (aQuantMult * bQuantMult);
+    unquant_mult = unquant_mult * scale;
+
+    typedef typename intgemm_<vtype>::type Integer;
+    if(bias) { // dispatch a multiply with integrated bias addition i.e. affine(...)
+      intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
+                                       /*B=*/bQuant->val()->data<Integer>(),
+                                       rows(aQuant->val()),
+                                       cols(aQuant->val()),
+                                       cols(bQuant->val()),
+                                       intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, /*bias=*/bias->val()->data(), /*output=*/out->val()->data()));
+    } else { // dispatch a multiply without bias addition i.e. dot(...)
+      intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
+                                       /*B=*/bQuant->val()->data<Integer>(),
+                                       rows(aQuant->val()),
+                                       cols(aQuant->val()),
+                                       cols(bQuant->val()),
+                                       intgemm::callbacks::UnquantizeAndWrite(unquant_mult, /*output=*/out->val()->data()));
+    }
+  };
+
+  std::vector<Expr> children = {aQuant, bQuant};
+  if(bias)
+    children.push_back(bias);
+
+  return lambda(children, outShape, Type::float32, dotOrAffineNodeOp); // inference-only Lambda node
+#else
+  a, bQuant, bias, transA, scale;
+  ABORT("You need to enable CPU compilation to use this feature. Use cmake .. -DCOMPILE_CPU=ON");
+#endif
+}
+
+// Dispatch the correct hardware-agnostic or hardware-specific matrix multiply
+static inline Expr affineOrDot(Expr a, Expr bQuant, Expr bias, bool transA, bool transB, float scale) {
+  Type bQuantElementType = bQuant->value_type();
+  static const bool pass = cpu::integer::passOrAbort(bQuantElementType);
+  pass; // We declare this variable as static so that passOrAbort is only ever run once during initialization.
+  switch(bQuantElementType) {
+    //case Type::intgemm8 :  // The generic case selects CPU automatically, but we set all the types manually anyways.
+    //  return cpu::integer::affineOrDotTyped<Type::intgemm8>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8ssse3 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8ssse3>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8avx2 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8avx2>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8avx512 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8avx512>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8avx512vnni :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8avx512vnni>(a, bQuant, bias, transA, transB, scale);
+    //case Type::intgemm16 :  // The generic case selects CPU automatically, but we set all the types manually anyways.
+    //  return cpu::integer::affineOrDotTyped<Type::intgemm16>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm16sse2 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm16sse2>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm16avx2 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm16avx2>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm16avx512 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm16avx512>(a, bQuant, bias, transA, transB, scale);
+    default:
+      ABORT("Unsupported type {} for intgemm.", bQuantElementType);
+  }
+}
+
+} // namespace integer
+} // namespace cpu
+} // namespace marian
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
index 8529db8b5..86b87b033 100755
--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@@ -7,8 +7,17 @@
 #include "tensors/tensor.h"
 #include "tensors/tensor_allocator.h"
 
+#if MKL_FOUND
+#include <mkl.h>
+#else
+#if BLAS_FOUND
+#include <cblas.h>
+#endif
+#endif
+
+#include "integer_common.h"
 #include "prod_blas.h"
-#include "sharp/int_gemm.h"
+
 
 namespace marian {
 
@@ -187,7 +196,7 @@ void ProdWithBias(marian::Tensor C,
                   float beta,
                   float scalar) {
   cpu::Prod(C, A, B, transA, transB, beta, scalar);
-  cpu::int16::AddBias(C, bias);
+  cpu::integer::AddBias(C, bias);
 }
 
 void CSRProd(marian::Tensor C,
diff --git a/src/tensors/cpu/sharp/avx_gemm.cpp b/src/tensors/cpu/sharp/avx_gemm.cpp
deleted file mode 100644
index 61f75fea6..000000000
--- a/src/tensors/cpu/sharp/avx_gemm.cpp
+++ /dev/null
@@ -1,615 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#ifdef __AVX512F__
-
-namespace marian {
-namespace cpu {
-namespace int16 {
-
-namespace {
-// Load from memory, multiply, and convert to int32_t.
-inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
-  // Load 16 floats
-  __m512 val = _mm512_load_ps(input);
-  // Multiply each by the quantization factor.
-  val = _mm512_mul_ps(val, quant_mult_reg);
-  // Cast to 32-bit int
-  return _mm512_cvtps_epi32(val);
-}
-}  // namespace
-
-// Convert
-void AVX_Quantize16(const float *input,
-                    int16_t *output,
-                    float quant_mult,
-                    std::size_t size) {
-  assert(size % 16 == 0);
-  assert(reinterpret_cast(input) % 64 == 0);
-  // Fill with the quantization multiplier.
-  const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
-  const float *end = input + size;
-  for(; input != end; input += 16, output += 16) {
-    // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi16( - output, 0xffff, QuantizerGrab(input, quant_mult_reg)); - } -} - -void AVX_Quantize8(const float *input, - int8_t *output, - float quant_mult, - std::size_t size) { - assert(size % 16 == 0); - assert(reinterpret_cast(input) % 64 == 0); - const __m512i neg127 = _mm512_set1_epi32(-127); - const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); - const float *end = input + size; - for(; input < end; input += 16, output += 16) { - __m512i asint = QuantizerGrab(input, quant_mult_reg); - /* Ban -128. We can't negate it. - * The largest possbile product is -128 * -128 = 2^14. If two of those are - * summed that's 2^15 which is too large for int16_t. By banning -128 we - * can accumulate two in int16_t w/o saturation before going to int32_t. - * But this is ok because apparently the instruction will saturate. - */ - asint = _mm512_max_epi32(asint, neg127); - // There doesn't seem to be an unmasked version. - _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint); - } -} - -namespace { - -union FloatAccess { - float as_f[4]; - __m128 as_n; -}; -union IntAccess { - int32_t as_i[4]; - __m128i as_n; -}; - -/* Convert 16-bit to 32-bit and add, not caring what parts are added. - * Implementations: - * 1. - * https://github.com/tesseract-ocr/tesseract/blob/master/src/arch/intsimdmatrixavx2.cpp#L67 - * under Apache license: This does a multiply by 1 and horizontal add: - * _mm512_madd_epi16(sum, _mm512_set1_epi16(1)) - * Current fastest. - * - * 2. Signed extension and fold halves: - * sum = _mm512_add_epi32( - * _mm512_cvtepi16_epi32(_mm512_castsi512_si256(sum)), - * _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(sum, 1))); - * - * 3. Sign extend by abuse of bitshift, then add. - * __m128i shift16 = _mm_set_epi32(0,0,0,16); - * sum = _mm512_add_epi32( - * _mm512_sra_epi32(_mm512_sll_epi32(sum, shift16), shift16), - * _mm512_sra_epi32(sum, shift16)); - */ -inline void Convert32Sum(__m512i &sum) { - short one = 1; - sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(one)); -} - -// Two sum version. -struct ReducedPair { - int32_t result[2]; -}; -inline ReducedPair Reduce16to32(__m512i sum1, __m512i sum2) { - Convert32Sum(sum1); - Convert32Sum(sum2); - // 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 - __m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2), - _mm512_unpacklo_epi32(sum1, sum2)); - // 1 2 1 2 1 2 1 2 - __m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack12), - _mm512_extracti64x4_epi64(pack12, (short)1)); - // 1 2 1 2 - IntAccess a; - a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves), - _mm256_extracti128_si256(halves, 1)); - ReducedPair ret; - ret.result[0] = a.as_i[0] + a.as_i[2]; - ret.result[1] = a.as_i[1] + a.as_i[3]; - return ret; -} - -// Assuming sum1, sum2, sum3, and sum4 are arrays 32-bit signed integers, -// reduce within each. -// Returns [sum(sum1), sum(sum2), sum(sum3), sum(sum4)] -// TODO: consider doing in 64-bit, allowing 4 more bits of quantization? -inline __m128i Reduce32(__m512i sum1, - __m512i sum2, - __m512i sum3, - __m512i sum4) { - // 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 - __m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2), - _mm512_unpacklo_epi32(sum1, sum2)); - // 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 - __m512i pack34 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum3, sum4), - _mm512_unpacklo_epi32(sum3, sum4)); - // 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 - __m512i pack1234 = _mm512_add_epi32(_mm512_unpackhi_epi64(pack12, pack34), - _mm512_unpacklo_epi64(pack12, pack34)); - // Cut the register into halves and sum those. 
1 2 3 4 1 2 3 4 - __m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack1234), - _mm512_extracti64x4_epi64(pack1234, (short)1)); - // Again: cut the register into halves and sum those. 1 2 3 4 - return _mm_add_epi32(_mm256_castsi256_si128(halves), - _mm256_extracti128_si256(halves, 1)); -} - -// Four sum version -inline __m128i Reduce16to32(__m512i sum1, - __m512i sum2, - __m512i sum3, - __m512i sum4) { - Convert32Sum(sum1); - Convert32Sum(sum2); - Convert32Sum(sum3); - Convert32Sum(sum4); - return Reduce32(sum1, sum2, sum3, sum4); -} - -// Somewhat inefficient reduce for single __m256i containing int32_t -inline int32_t Reduce32(__m256i halves) { - IntAccess a; - a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves), - _mm256_extracti128_si256(halves, 1)); - // TODO is there a more efficient way? - return a.as_i[0] + a.as_i[1] + a.as_i[2] + a.as_i[3]; -} - -// Somewhat inefficient reduce for single __m512i containing int32_t -inline int32_t Reduce32(__m512i sum1) { - // Fold register over itself. - return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1), - _mm512_extracti64x4_epi64(sum1, (short)1))); -} - -inline int32_t Reduce16to32(__m512i sum1) { - Convert32Sum(sum1); - // Fold register over itself. - return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1), - _mm512_extracti64x4_epi64(sum1, (short)1))); -} - -class ScatterPut { -public: - explicit ScatterPut(float unquant_mult, int num_B_rows) - : unquant_mult_(unquant_mult), - unquant_mult_sse_(_mm_set1_ps(unquant_mult)), -#ifdef __AVX512VL__ - num_b_rows_scatter_(_mm_set_epi32(num_B_rows * 3 * sizeof(float), - num_B_rows * 2 * sizeof(float), - num_B_rows * 1 * sizeof(float), - num_B_rows * 0 * sizeof(float))), -#endif - num_B_rows_(num_B_rows) { - } - - inline void Write(float *base, __m128i reduced) { - __m128 float_sums = _mm_cvtepi32_ps(reduced); - float_sums = _mm_mul_ps(float_sums, unquant_mult_sse_); -#ifdef __AVX512VL__ - // The scatter instruction requires avx512vl - _mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, (short)1); -#else - FloatAccess a; - // Get floats for each of the sums to write. - a.as_n = float_sums; - // Also note that the memory acceses on C are not consecutive, but this is a - // tradeoff that we have to make. We can't have consecutive accesses of A, - // B, *and* C. But we access A and B a lot more so it makes sense to do it - // this way. Scatter to outputs: - base[0] = a.as_f[0]; - base[num_B_rows_] = a.as_f[1]; - base[2 * num_B_rows_] = a.as_f[2]; - base[3 * num_B_rows_] = a.as_f[3]; -#endif - } - - inline void Write(float *base, ReducedPair reduced) { - base[0] = unquant_mult_ * static_cast(reduced.result[0]); - base[num_B_rows_] = unquant_mult_ * static_cast(reduced.result[1]); - } - - inline void Write(float *base, int32_t reduced) { - base[0] = unquant_mult_ * static_cast(reduced); - } - -private: - const float unquant_mult_; - const __m128 unquant_mult_sse_; -#ifdef __AVX512VL__ - const __m128i num_b_rows_scatter_; -#endif - const int num_B_rows_; -}; - -} // namespace - -// This is an AVX512F implementation of int16_t multiply based on Jacob -// Devlin's SSE code. 
The original SSE code was: - -// Copyright (c) 2017 Microsoft Corporation - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -// We are multiplying A * B^T, as opposed to A * B. This is important because it -// means we can do consecutive memory access on A * B^T which allows to to take -// the most advantage of L1 cache. -// -// B is typically a weight matrix, so it can be pre-processed offline, and -// therefore this transpose does not cost anything. A is typically an activation -// minibatch matrix. A and B must be 64-byte aligned. C should be the usual -// 4-byte alignment. -void AVX_MatrixMult16(const __m512i *A, - const __m512i *B, - float *C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width) { - assert(width % 32 == 0); - assert(reinterpret_cast(A) % 64 == 0); - assert(reinterpret_cast(B) % 64 == 0); - - ScatterPut put(unquant_mult, num_B_rows); - - const int sse_width = width / 32; - - // We do loop unrolling over A. This is *significantly* faster - // since B can live in the registers. We are assuming that - // A is a multiple of 4, but we can add extra code to handle values of 1, - // 2, 3. - // - // We could also do loop unrolling over B, which adds some additional speedup. - // We don't do that for the sake of clarity. - // - // There are other memory access patterns we could do, e.g., put B on the - // outer loop. The justification is that A is typically small enough that it - // can live in L1 cache. B is usually a larger weight matrix, so it might not - // be able to. However, we are using each element of B four times while it's - // still in a register, so caching is not as important. - - // Round down to a multiple of 4. - int num_unroll_rows = num_A_rows & ~3; - for(int i = 0; i < num_unroll_rows; i += 4) { - const __m512i *A1_row = A + (i + 0) * sse_width; - const __m512i *A2_row = A + (i + 1) * sse_width; - const __m512i *A3_row = A + (i + 2) * sse_width; - const __m512i *A4_row = A + (i + 3) * sse_width; - - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - - // This is just a simple dot product, unrolled four ways. 
- for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - - __m512i a1 = *(A1_row + k); - __m512i a2 = *(A2_row + k); - __m512i a3 = *(A3_row + k); - __m512i a4 = *(A4_row + k); - - // madd_epi16 does multiply add on 8 16-bit integers and accumulates - // into a four 32-bit register. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7, - // h8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints) - // result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8] - // (32-bit ints) Then add_epi32 just effectively does a += on these - // 32-bit integers. - sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1)); - sum2 = _mm512_add_epi32(sum2, _mm512_madd_epi16(b, a2)); - sum3 = _mm512_add_epi32(sum3, _mm512_madd_epi16(b, a3)); - sum4 = _mm512_add_epi32(sum4, _mm512_madd_epi16(b, a4)); - } - put.Write(C + i * num_B_rows + j, Reduce32(sum1, sum2, sum3, sum4)); - } - } - // Handle the non-multiples of 4 rows. - // TODO: efficient version for 3 rows, 2 rows, etc. - for(int i = num_unroll_rows; i < num_A_rows; ++i) { - const __m512i *A1_row = A + i * sse_width; - for(int j = 0; j < num_B_rows; j++) { - __m512i sum1 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - const __m512i *B_row = B + j * sse_width; - __m512i b = *(B_row + k); - __m512i a1 = *(A1_row + k); - sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1)); - } - // TODO is there a more efficient way? - *(C + (i)*num_B_rows + j) - = unquant_mult * static_cast(Reduce32(sum1)); - } - } -} - -namespace { - -/* Three ways considered to apply sign bits: - * 1. Use 256-bit sign instruction: - * __m256i a_first = _mm256_sign_epi8(_mm512_castsi512_si256(a), - * _mm512_castsi512_si256(b)); - * __m256i a_second = _mm256_sign_epi8(_mm512_extracti64x4_epi64(a, 1), - * b_second); a = _mm512_inserti64x4(_mm512_castsi256_si512(a_first), a_second, - * 1); a = Concat(a_first, a_second); - * - * 2. Extract a mask and xor + 1 - * __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - * Use set1 to to build to_xor - * a = _mm512_xor_si512(a, to_xor) - * And add one: - * const __m512i ones8 = _mm512_set1_epi8(1); - * a = _mm512_mask_add_epi8(a, neg_mask, a, ones8); - * - * 3. Extract a mask and subtract from 0 - * In the outer loop on b: - * __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)) - * For each a: - * a = _mm512_mask_sub_epi8(a, neg_mask, _mm512_setzero_si512(), a); - * - * Finally, subtraction won the benchmark - */ -inline void Accum(const __m512i zeros, - __m512i a, - const __m512i b, - const __m512i b_positive, - const __mmask64 neg_mask, - __m512i &sum) { - // Apply sign bits. - a = _mm512_mask_sub_epi8(a, neg_mask, zeros, a); - // The magic 8-bit multiply then horizontal sum into 16-bit. - __m512i multiplied = _mm512_maddubs_epi16(b_positive, a); - // Now we have 16-bit results that are the sum of two multiplies. - // Choosing to approximate and do adds. 
- // Perhaps every so often we could accumulate by Convert32Sum - sum = _mm512_adds_epi16(sum, multiplied); - b; // make compiler happy -} - -} // namespace - -void AVX_MatrixMult8(const __m512i *A, - const __m512i *B, - float *C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width) { - assert(width % 32 == 0); - assert(reinterpret_cast(A) % 64 == 0); - assert(reinterpret_cast(B) % 64 == 0); - ScatterPut put(unquant_mult, num_B_rows); - const __m512i zeros = _mm512_setzero_si512(); - - const int sse_width = width / 64; - int i = 0; - int mult8rows = num_A_rows & (~7); - - for(; i < mult8rows; i += 8) { - const __m512i *A1_row = A + (i + 0) * sse_width; - const __m512i *A2_row = A + (i + 1) * sse_width; - const __m512i *A3_row = A + (i + 2) * sse_width; - const __m512i *A4_row = A + (i + 3) * sse_width; - const __m512i *A5_row = A + (i + 4) * sse_width; - const __m512i *A6_row = A + (i + 5) * sse_width; - const __m512i *A7_row = A + (i + 6) * sse_width; - const __m512i *A8_row = A + (i + 7) * sse_width; - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - __m512i sum6 = _mm512_setzero_si512(); - __m512i sum7 = _mm512_setzero_si512(); - __m512i sum8 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - /* Didn't seem to make a difference definining sign bits here vs at top - */ - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6); - Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7); - Accum(zeros, *(A8_row + k), b, b_positive, neg_mask, sum8); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, - Reduce16to32(sum5, sum6, sum7, sum8)); - } - } - - const __m512i *A1_row = A + (i + 0) * sse_width; - const __m512i *A2_row = A + (i + 1) * sse_width; - const __m512i *A3_row = A + (i + 2) * sse_width; - const __m512i *A4_row = A + (i + 3) * sse_width; - const __m512i *A5_row = A + (i + 4) * sse_width; - const __m512i *A6_row = A + (i + 5) * sse_width; - const __m512i *A7_row = A + (i + 6) * sse_width; - switch(num_A_rows & 7) { - case 7: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - __m512i sum6 = _mm512_setzero_si512(); - __m512i sum7 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, 
b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6); - Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6)); - put.Write(C + (i + 6) * num_B_rows + j, Reduce16to32(sum7)); - } - /* fall through */ - case 6: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - __m512i sum6 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6)); - } - /* fall through */ - case 5: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5)); - } - /* fall through */ - case 4: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - } - /* fall through */ - case 3: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); 
- __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2)); - put.Write(C + (i + 2) * num_B_rows + j, Reduce16to32(sum3)); - } - /* fall through */ - case 2: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2)); - } - /* fall through */ - case 1: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1)); - } - } -} - -} // namespace int16 -} // namespace cpu -} // namespace marian -#endif diff --git a/src/tensors/cpu/sharp/int_gemm.cpp b/src/tensors/cpu/sharp/int_gemm.cpp deleted file mode 100644 index cdb7cbf07..000000000 --- a/src/tensors/cpu/sharp/int_gemm.cpp +++ /dev/null @@ -1,187 +0,0 @@ -#include "int_gemm.h" -#include "tensors/tensor_allocator.h" -#include "tensors/tensor_operators.h" - -#include -#include -#include -#include -#include -#include - -namespace marian { -namespace cpu { -namespace int16 { - -#ifdef __AVX512F__ -void AVX_Quantize16(const float* input, - int16_t* output, - float quant_mult, - std::size_t size); - -void AVX_Quantize8(const float* input, - int8_t* output, - float quant_mult, - std::size_t size); - -void AVX_MatrixMult16(const __m512i* A, - const __m512i* B, - float* C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width); - -void AVX_MatrixMult8(const __m512i* A, - const __m512i* B, - float* C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width); -#endif - -void SSE_Quantize16(const float* input, - __m128i* output, - float quant_mult, - int num_rows, - int width); - -void SSE_MatrixMult16(const __m128i* A, - const __m128i* B, - float* C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width); - -void Quantize16(marian::Tensor out, - const marian::Tensor in, - float /*clipValue*/) { - float quant_mult = (float)pow(2.0, BITS); -#ifdef __AVX512F__ - AVX_Quantize16( - in->data(), out->data(), quant_mult, in->shape().elements()); -#else - int num_rows = in->shape().elements() / in->shape()[-1]; - int width = in->shape()[-1]; - SSE_Quantize16(in->data(), out->data<__m128i>(), quant_mult, num_rows, width); -#endif -} - -void Quantize8(marian::Tensor out, - const marian::Tensor in, - float clipValue) { -#ifdef __AVX512F__ - float quant_mult = 127.0f / clipValue; - AVX_Quantize8( - in->data(), out->data(), quant_mult, in->shape().elements()); -#else - out; in; clipValue; - ABORT("8-bit is currently only AVX512"); -#endif -} - -// This operates on floats after 
processing so doesn't care about int8_t vs -// int16_t. -void AddBias(marian::Tensor C, const marian::Tensor Bias) { - float* y = C->data(); - const float* x = C->data(); - const float* bias = Bias->data(); - - const int m = C->shape().elements() / C->shape()[-1]; - const int n = C->shape()[-1]; - - for(int j = 0; j < m; ++j) { - int i = 0; -#ifdef __AVX512F__ - int n16 = n & ~15; - for(; i < n16; i += 16) { - __m512 ai = _mm512_loadu_ps(x + j * n + i); - __m512 bi = _mm512_loadu_ps(bias + i); - __m512 yi = _mm512_add_ps(ai, bi); - _mm512_storeu_ps(y + j * n + i, yi); - } -#else - int n4 = (n / 4) * 4; - for(; i < n4; i += 4) { - __m128 ai = _mm_loadu_ps(x + j * n + i); - __m128 bi = _mm_loadu_ps(bias + i); - __m128 yi = _mm_add_ps(ai, bi); - _mm_storeu_ps(y + j * n + i, yi); - } -#endif - for(; i < n; i++) { - y[j * n + i] = x[j * n + i] + bias[i]; - } - } -} - -void ProdInt16(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale) { - ABORT_IF(scale != 1, "Scale other than 1 not supported"); - - // @TODO: make this a parameter - float quant_mult = (float)pow(2.0, BITS); - - // If we quantize to n bits and then multiple the values together, the result - // will be quantized to n^2 bits. So we must divide by 1.0/(n^2) to get back - // the original value. - float unquant_mult = 1.0f / (quant_mult * quant_mult); - - float* fC = C->data(); - int num_A_rows = A->shape().elements() / A->shape()[-1]; - int num_B_rows = B->shape().elements() / B->shape()[-1]; - int width = B->shape()[-1]; -#ifdef __AVX512F__ - AVX_MatrixMult16(A->data<__m512i>(), - B->data<__m512i>(), - fC, - unquant_mult, - num_A_rows, - num_B_rows, - width); -#else - SSE_MatrixMult16(A->data<__m128i>(), - B->data<__m128i>(), - fC, - unquant_mult, - num_A_rows, - num_B_rows, - width); -#endif -} - -void ProdInt8(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale, - float clipValue) { -#ifdef __AVX512F__ - // This would be easy... - ABORT_IF(scale != 1, "Scale other than 1 not supported"); - float quant_mult = 127.0f / clipValue; - float unquant_mult = 1.0f / (quant_mult * quant_mult); - - float* fC = C->data(); - int num_A_rows = A->shape().elements() / A->shape()[-1]; - int num_B_rows = B->shape().elements() / B->shape()[-1]; - int width = B->shape()[-1]; - AVX_MatrixMult8(A->data<__m512i>(), - B->data<__m512i>(), - fC, - unquant_mult, - num_A_rows, - num_B_rows, - width); -#else - C; A; B; scale; clipValue; - ABORT("8-bit is currently only AVX512"); -#endif -} - -} // namespace int16 -} // namespace cpu -} // namespace marian diff --git a/src/tensors/cpu/sharp/int_gemm.h b/src/tensors/cpu/sharp/int_gemm.h deleted file mode 100644 index 3ae231562..000000000 --- a/src/tensors/cpu/sharp/int_gemm.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include "tensors/tensor.h" - -namespace marian { -namespace cpu { -namespace int16 { - -const int BITS = 10; - -void Quantize16(marian::Tensor out, - const marian::Tensor in, - float /*clipValue*/); - -void Quantize8(marian::Tensor out, - const marian::Tensor in, - float clipValue); - -// This operates on floats after processing so doesn't care about int8_t vs -// int16_t. 
-void AddBias(marian::Tensor C, const marian::Tensor Bias); - -void ProdInt16(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale); - -void ProdInt8(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale, - float clipValue); - -} // namespace int16 -} // namespace cpu -} // namespace marian diff --git a/src/tensors/cpu/sharp/sse_gemm.cpp b/src/tensors/cpu/sharp/sse_gemm.cpp deleted file mode 100644 index ea6f25cba..000000000 --- a/src/tensors/cpu/sharp/sse_gemm.cpp +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright (c) 2017 Microsoft Corporation - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace marian { -namespace cpu { -namespace int16 { - -// This is a reference implementation of 16-bit matrix multiplication described -// in "Sharp Models on Dull Hardware: Fast and Accurate Neural Machine -// Translation Decoding on the CPU". This model is not as fast as the one in the -// paper, becuase it uses SSE2 instead of AVX2. AVX2 instructions are only -// available on more modern CPUs (Haswell or later). The only difference between -// SSE2 and AVX2 is that SSE operates on 128-bit vectors and AVX2 operates on -// 256-bit vecetors. So AVX2 can fit 16 16-bit integers intead of 8 8-bit -// integers. The algorithm is the same, you just replace these instructions with -// their 256-bit counterpart, i.e., _mm256_add_epi32, _mm256_madd_epi16, -// _mm256_hadd_epi32, ... Additional improvements can also be made from -// unrolling the for loop over num_B_rows in SSE_MatrixMult, which is not done -// here for clarity. - -// *************************************** -// ************** IMPORTANT ************** -// *************************************** -// The biggest "gotcha" when using this type of multiplication is dealing with -// overflow related to quantization. It is NOT enough to simply ensure that A -// and B fit into 16 bit integers. If A and B are quantized with $n$ bits, the -// result of multiplying them together will be quantized to $n^2$ bits. So if -// they are near the boundary of the 16-bit mark, then the result will be near -// 32-bits and overflow. However, if we use, say, n = 10 bits, then the product -// is 20 bits. This gives us 12 bits left over for the accumulation. So as long -// as the width of the common dimension is less than 2^12 = 4096, it is -// *impossible* to overflow. 
If we used, say, n = 12 bits, then we have -// 32-(12*2) = 8 bits left over. So we *could* overflow if width > 2^8. -// -// So, the tradeoff is between quantization precision and possibility of -// overflow. A good general value is 10 bits, since this gives high precision -// (precision is 1/2^10 ~= 0.001, which is more than what's needed for almost -// all neural nets), and cannot overflow unless the matrix width is > 4096. - -// This quantizes floating point values into fixed-point 16-bit integers. -// Effectively, we are performing an SSE version of float x = ...; int16_t y = -// (int16_t)(quant_mult*x); -// -// Except that the casting is saturated. However, you should always ensure that -// the input fits into a fixed range anyways. I.e., you should ensure that -// quant_mult*x fits into the range [-2^15, 2^15]. This should always be -// possible because the value you're quantizing will either be NN weights or NN -// activations, both of which can be clipped to a fixed range during training. - -void SSE_Quantize16(const float* input, - __m128i* output, - float quant_mult, - int num_rows, - int width) { - assert(width % 8 == 0); - - int num_input_chunks = width / 8; - - // Fill an SSE float with 4 copies of the quant mult - __m128 sse_quant_mult - = _mm_set_ps(quant_mult, quant_mult, quant_mult, quant_mult); - - for(int i = 0; i < num_rows; i++) { - const float* input_row = input + i * width; - __m128i* output_row = output + i * num_input_chunks; - for(int j = 0; j < num_input_chunks; j++) { - const float* x = input_row + j * 8; - // Process 8 floats at once, since each __m128i can contain 8 16-bit - // integers. - - // Load floats floats into SSE registers. - __m128 f_0 = _mm_loadu_ps(x); - __m128 f_1 = _mm_loadu_ps(x + 4); - - // Multiply by quantization factor (e.g., if quant_mult = 1000.0, 0.34291 - // --> 342.21) - __m128 m_0 = _mm_mul_ps(f_0, sse_quant_mult); - __m128 m_1 = _mm_mul_ps(f_1, sse_quant_mult); - - // Cast float to 32-bit int (e.g., 342.21 --> 342) - __m128i i_0 = _mm_cvtps_epi32(m_0); - __m128i i_1 = _mm_cvtps_epi32(m_1); - - // Cast 32-bit int to 16-bit int. You must ensure that these fit into the - // 16-bit range by clipping values during training. - *(output_row + j) = _mm_packs_epi32(i_0, i_1); - } - } -} - -// We are multiplying A * B^T, as opposed to A * B. This is important because it -// means we can do consecutive memory access on A * B^T which allows to to take -// the most advantage of L1 cache. -// -// B is typically a weight matrix, so it can be pre-processed offline, and -// therefore this transpose does not cost anything. A is typically an activation -// minibatch matrix. -void SSE_MatrixMult16(const __m128i* qA, - const __m128i* qB, - float* fC, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width) { - assert(width % 8 == 0); - - int sse_width = width / 8; - - // We do loop unrolling over A. This is *significantly* faster - // since B can live in the registers. We are assuming that - // A is a multiple of 4, but we can add extra code to handle values of 1, - // 2, 3. - // - // We could also do loop unrolling over B, which adds some additional speedup. - // We don't do that for the sake of clarity. - // - // There are other memory access patterns we could do, e.g., put B on the - // outer loop. The justification is that A is typically small enough that it - // can live in L1 cache. B is usually a larger weight matrix, so it might not - // be able to. 
However, we are using each element of B four times while it's - // still in a register, so caching is not as important. - - int mult4 = (num_A_rows / 4) * 4; - int rest = num_A_rows % 4; - - int i = 0; - for(; i < mult4; i += 4) { - const __m128i* A1_row = qA + (i + 0) * sse_width; - const __m128i* A2_row = qA + (i + 1) * sse_width; - const __m128i* A3_row = qA + (i + 2) * sse_width; - const __m128i* A4_row = qA + (i + 3) * sse_width; - - for(int j = 0; j < num_B_rows; j++) { - const __m128i* B_row = qB + j * sse_width; - - __m128i sum1 = _mm_setzero_si128(); - __m128i sum2 = _mm_setzero_si128(); - __m128i sum3 = _mm_setzero_si128(); - __m128i sum4 = _mm_setzero_si128(); - - // This is just a simple dot product, unrolled four ways. - for(int k = 0; k < sse_width; k++) { - __m128i b = *(B_row + k); - - __m128i a1 = *(A1_row + k); - __m128i a2 = *(A2_row + k); - __m128i a3 = *(A3_row + k); - __m128i a4 = *(A4_row + k); - - // _mm_madd_epi16 does multiply add on 8 16-bit integers and accumulates - // into a four 32-bit register. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7, - // h8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints) - // result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8] - // (32-bit ints) Then _mm_add_epi32 just effectively does a += on these - // 32-bit integers. - sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1)); - sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2)); - sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3)); - sum4 = _mm_add_epi32(sum4, _mm_madd_epi16(b, a4)); - } - - // We now have each sum spread across 4 32-bit ints in SSE register, e.g., - // sum1 = [r1, r2, r3, r4]. We need to compute r1 + r2 + r3 + r4. - // - // This uses 'horizontal add' to do that efficiently. The first add gets - // us [r1 + r2, r2 + r3, r1 + r2, r2 + r3] Then the second gets us. [r1 + - // r2 + r2 + r3, r2 + r3 + r1 + r2, r1 + r2 + r2 + r3, r2 + r3 + r1 + r2] - // E.g., each 32-bit in contains the full sum. - sum1 = _mm_hadd_epi32(sum1, sum1); - sum1 = _mm_hadd_epi32(sum1, sum1); - sum2 = _mm_hadd_epi32(sum2, sum2); - sum2 = _mm_hadd_epi32(sum2, sum2); - sum3 = _mm_hadd_epi32(sum3, sum3); - sum3 = _mm_hadd_epi32(sum3, sum3); - sum4 = _mm_hadd_epi32(sum4, sum4); - sum4 = _mm_hadd_epi32(sum4, sum4); - - float* C1 = fC + (i + 0) * num_B_rows + j; - float* C2 = fC + (i + 1) * num_B_rows + j; - float* C3 = fC + (i + 2) * num_B_rows + j; - float* C4 = fC + (i + 3) * num_B_rows + j; - - // Now that we have the full sum in each 32-bit register, we convert them - // to an integer with _mm_cvtepi32_ps and take the first one with - // _mm_store_ss. We don't use an SSE instruction to unquantize, although - // we could. It doesn't really matter since most of the computation is in - // the above loop over the width. - // - // Also note that the memory acceses on C are not consecutive, but this is - // a tradeoff that we have to make. We can't have consecutive accesses of - // qA, qB, *and* C. But we access qA and qB a lot more so it makes sense - // to do it this way. 
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-
-      _mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
-      *(C2) *= unquant_mult;
-
-      _mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
-      *(C3) *= unquant_mult;
-
-      _mm_store_ss(C4, _mm_cvtepi32_ps(sum4));
-      *(C4) *= unquant_mult;
-    }
-  }
-  if(rest == 1) {
-    const __m128i* A1_row = qA + (i + 0) * sse_width;
-
-    for(int j = 0; j < num_B_rows; j++) {
-      const __m128i* B_row = qB + j * sse_width;
-
-      __m128i sum1 = _mm_setzero_si128();
-
-      // This is just a simple dot product, unrolled four ways.
-      for(int k = 0; k < sse_width; k++) {
-        __m128i b = *(B_row + k);
-
-        __m128i a1 = *(A1_row + k);
-        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
-      }
-
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-
-      float* C1 = fC + (i + 0) * num_B_rows + j;
-
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-    }
-  } else if(rest == 2) {
-    const __m128i* A1_row = qA + (i + 0) * sse_width;
-    const __m128i* A2_row = qA + (i + 1) * sse_width;
-
-    for(int j = 0; j < num_B_rows; j++) {
-      const __m128i* B_row = qB + j * sse_width;
-
-      __m128i sum1 = _mm_setzero_si128();
-      __m128i sum2 = _mm_setzero_si128();
-
-      for(int k = 0; k < sse_width; k++) {
-        __m128i b = *(B_row + k);
-
-        __m128i a1 = *(A1_row + k);
-        __m128i a2 = *(A2_row + k);
-
-        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
-        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
-      }
-
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-
-      float* C1 = fC + (i + 0) * num_B_rows + j;
-      float* C2 = fC + (i + 1) * num_B_rows + j;
-
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-
-      _mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
-      *(C2) *= unquant_mult;
-    }
-  } else if(rest == 3) {
-    const __m128i* A1_row = qA + (i + 0) * sse_width;
-    const __m128i* A2_row = qA + (i + 1) * sse_width;
-    const __m128i* A3_row = qA + (i + 2) * sse_width;
-
-    for(int j = 0; j < num_B_rows; j++) {
-      const __m128i* B_row = qB + j * sse_width;
-
-      __m128i sum1 = _mm_setzero_si128();
-      __m128i sum2 = _mm_setzero_si128();
-      __m128i sum3 = _mm_setzero_si128();
-
-      for(int k = 0; k < sse_width; k++) {
-        __m128i b = *(B_row + k);
-
-        __m128i a1 = *(A1_row + k);
-        __m128i a2 = *(A2_row + k);
-        __m128i a3 = *(A3_row + k);
-
-        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
-        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
-        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3));
-      }
-
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-      sum3 = _mm_hadd_epi32(sum3, sum3);
-      sum3 = _mm_hadd_epi32(sum3, sum3);
-
-      float* C1 = fC + (i + 0) * num_B_rows + j;
-      float* C2 = fC + (i + 1) * num_B_rows + j;
-      float* C3 = fC + (i + 2) * num_B_rows + j;
-
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-
-      _mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
-      *(C2) *= unquant_mult;
-
-      _mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
-      *(C3) *= unquant_mult;
-    }
-  }
-}
-
-} // namespace int16
-} // namespace cpu
-} // namespace marian
diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h
index 9bb97174f..75cc604da 100644
--- a/src/tensors/gpu/backend.h
+++ b/src/tensors/gpu/backend.h
@@ -66,16 +66,6 @@ class Backend : public marian::Backend {
 
   CudaCompute getCudaComputeCapability() { return compute_; }
 
-  // for CPU, sets to use optimized code for inference.
-  // for GPU, this is invalid. for gpu, isOptimized() function always returns false.
-  void setOptimized(bool optimize) override {
-    LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize);
-  }
-
-  bool isOptimized() override {
-    return false;
-  }
-
 private:
   cublasHandle_t cublasHandle_{0};     // make sure it's 0, so it can be initalized lazily
   cusparseHandle_t cusparseHandle_{0}; // as above
diff --git a/src/tests/prod.cpp b/src/tests/prod.cpp
index 698712b9c..aae9f5573 100644
--- a/src/tests/prod.cpp
+++ b/src/tests/prod.cpp
@@ -7,7 +7,9 @@ int main(int /*argc*/, char** /*argv*/) {
   {
     auto g = New(true);
     g->setDevice({0, DeviceType::cpu});
-    g->getBackend()->setOptimized(false);
+#if 0 // this file is not a real test, just used for manual stuff. Disable here by hand for now.
+    g->getBackend()->setInt16(false);
+#endif
     g->reserveWorkspaceMB(2512);
 
     timer::AutoTimer timer;
@@ -40,7 +42,44 @@ int main(int /*argc*/, char** /*argv*/) {
   {
     auto g = New(true);
     g->setDevice({0, DeviceType::cpu});
-    g->getBackend()->setOptimized(true);
+#if 0
+    g->getBackend()->setInt16(true);
+#endif
+    g->reserveWorkspaceMB(2512);
+
+    timer::AutoTimer timer;
+    for(int i = 0; i < 100; ++i) {
+      g->clear();
+
+      auto x = g->constant({1, 4, 8, 256}, inits::glorotUniform());
+
+      auto W1 = g->param("W1", {256, 2048}, inits::glorotUniform());
+      auto b1 = g->param("b1", {1, 2048}, inits::glorotUniform());
+
+      auto out = affine(x, W1, b1);
+
+      for(int i = 2; i < 20; ++i) {
+        auto Wi = g->param("W" + std::to_string(i), {2048, 2048}, inits::glorotUniform());
+        auto bi = g->param("b" + std::to_string(i), {1, 2048}, inits::glorotUniform());
+
+        out = relu(affine(out, Wi, bi));
+      }
+
+      auto Wn = g->param("Wn", {2048, 256}, inits::glorotUniform());
+      auto bn = g->param("bn", {1, 256}, inits::glorotUniform());
+
+      auto y = affine(out, Wn, bn);
+
+      g->forward();
+    }
+  }
+
+  {
+    auto g = New(true);
+    g->setDevice({0, DeviceType::cpu});
+#if 0
+    g->getBackend()->setInt8(true);
+#endif
     g->reserveWorkspaceMB(2512);
 
     timer::AutoTimer timer;
diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp
index 4636f4b0c..e47074603 100644
--- a/src/training/graph_group_async.cpp
+++ b/src/training/graph_group_async.cpp
@@ -19,7 +19,6 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr config, Ptr mpi)
     auto graph = New();
     graph->setDevice(device);
     graph->setCheckpointing(options_->get("gradient-checkpointing"));
-    graph->getBackend()->setClip(options_->get("clip-gemm"));
     graph->reserveWorkspaceMB(options_->get("workspace"));
     graphs_.push_back(graph);
     shardOpt_.push_back(Optimizer(options_));
diff --git a/src/training/graph_group_singleton.h b/src/training/graph_group_singleton.h
index 8a93af71a..74d384987 100644
--- a/src/training/graph_group_singleton.h
+++ b/src/training/graph_group_singleton.h
@@ -35,7 +35,6 @@ class SingletonGraph : public GraphGroup, public ExponentialSmoothing {
    graph_ = New();
    graph_->setDevice(deviceId);
    graph_->setCheckpointing(options_->get("gradient-checkpointing"));
-   graph_->getBackend()->setClip(options_->get("clip-gemm"));
    graph_->reserveWorkspaceMB(options_->get("workspace"));
    opt_ = Optimizer(options_);
    builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training);
diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp
index eaeefb42b..de6d5e5f0 100755
--- a/src/training/graph_group_sync.cpp
+++ b/src/training/graph_group_sync.cpp
@@ -12,7 +12,6 @@ SyncGraphGroup::SyncGraphGroup(Ptr config, Ptr mpi)
     graph->setDevice(device);
     graph->setCheckpointing(options_->get("gradient-checkpointing"));
     graph->reserveWorkspaceMB(options_->get("workspace"));
-    graph->getBackend()->setClip(options_->get("clip-gemm"));
     graphs_.push_back(graph);
     shardOpt_.push_back(Optimizer(options_));
diff --git a/src/translator/translator.h b/src/translator/translator.h
index 15eb98702..1ff19a4ae 100755
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@@ -87,10 +87,6 @@ class Translate : public ModelTask {
       auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(prec[0]));
       graph->setDevice(device);
-      graph->getBackend()->setClip(options_->get("clip-gemm"));
-      if (device.type == DeviceType::cpu) {
-        graph->getBackend()->setOptimized(options_->get("optimize"));
-      }
       graph->reserveWorkspaceMB(options_->get("workspace"));
       graphs_[id] = graph;
 
@@ -229,10 +225,6 @@ class TranslateService : public ModelServiceTask {
      auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
      graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
      graph->setDevice(device);
-     graph->getBackend()->setClip(options_->get("clip-gemm"));
-     if (device.type == DeviceType::cpu) {
-       graph->getBackend()->setOptimized(options_->get("optimize"));
-     }
      graph->reserveWorkspaceMB(options_->get("workspace"));
      graphs_.push_back(graph);
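
For reference, the arithmetic that the deleted sse_gemm.cpp comments walk through (pairwise multiply-add with _mm_madd_epi16, two _mm_hadd_epi32 horizontal adds, then a convert and scale by unquant_mult) is shown in isolation in the standalone sketch below. It is illustrative only and not part of this patch: the array contents and the unquant_mult value are made up for the example, and it needs SSSE3 to build (e.g. g++ -mssse3).

// Standalone sketch of the 16-bit dot-product reduction documented above;
// illustrative only, not part of the patch.
#include <tmmintrin.h>  // SSSE3 for _mm_hadd_epi32; pulls in SSE2 for the rest
#include <cstdint>
#include <cstdio>

int main() {
  // Eight quantized 16-bit values each, standing in for one SSE-width chunk
  // of a row of A and a row of B.
  alignas(16) int16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  alignas(16) int16_t b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
  const float unquant_mult = 0.5f;  // example scale; the real code derives it from quantization

  __m128i va = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
  __m128i vb = _mm_load_si128(reinterpret_cast<const __m128i*>(b));

  // [a0*b0+a1*b1, a2*b2+a3*b3, a4*b4+a5*b5, a6*b6+a7*b7] as four 32-bit ints.
  __m128i sum = _mm_madd_epi16(va, vb);

  // Two horizontal adds leave the complete dot product in every 32-bit lane.
  sum = _mm_hadd_epi32(sum, sum);
  sum = _mm_hadd_epi32(sum, sum);

  // Convert lane 0 to float and unquantize, as the removed kernel did per element of C.
  float c;
  _mm_store_ss(&c, _mm_cvtepi32_ps(sum));
  c *= unquant_mult;

  printf("dot = %.1f\n", c);  // (1*8 + 2*7 + ... + 8*1) * 0.5 = 120 * 0.5 = 60.0
  return 0;
}

The removed kernel unrolls this over four rows of A at a time so that each loaded register of B is reused four times before it leaves the registers, which is the tradeoff its comments describe.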