From 600f5cbdecb027be2488297cca6a825b1f56af9a Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Mon, 25 Jan 2021 00:02:30 +0000 Subject: [PATCH] Integrate intgemm into marian (#595) Adds intgemm as a module for Marian. Intgemm is @kpu 's 8/16 bit gemm library with support for architectures from SSE2 to AVX512VNNI Removes outdated integer code, related to the --optimize option Co-authored-by: Kenneth Heafield Co-authored-by: Kenneth Heafield Co-authored-by: Ulrich Germann Co-authored-by: Marcin Junczys-Dowmunt Co-authored-by: Roman Grundkiewicz --- .gitmodules | 3 + CHANGELOG.md | 5 + CMakeLists.txt | 5 +- regression-tests | 2 +- src/3rd_party/CMakeLists.txt | 5 + src/3rd_party/intgemm | 1 + src/CMakeLists.txt | 8 +- src/command/marian_conv.cpp | 28 +- src/common/binary.cpp | 21 +- src/common/config_parser.cpp | 15 +- src/common/types.cpp | 11 +- src/common/types.h | 174 +++-- src/data/shortlist.h | 8 + src/embedder/embedder.h | 5 - src/graph/expression_operators.cpp | 48 +- src/microsoft/quicksand.cpp | 3 +- src/rescorer/rescorer.h | 5 - src/tensors/backend.h | 13 +- src/tensors/cpu/aligned.h | 43 ++ src/tensors/cpu/backend.h | 8 +- src/tensors/cpu/device.cpp | 38 +- src/tensors/cpu/expression_graph_packable.h | 266 ++++++++ src/tensors/cpu/fbgemm/expanded_gemm.h | 19 +- .../cpu/fbgemm/expression_graph_packable.h | 156 ----- src/tensors/cpu/int16.h | 113 ---- src/tensors/cpu/integer_common.cpp | 45 ++ src/tensors/cpu/integer_common.h | 223 +++++++ src/tensors/cpu/intgemm_interface.h | 132 ++++ src/tensors/cpu/prod.cpp | 13 +- src/tensors/cpu/sharp/avx_gemm.cpp | 615 ------------------ src/tensors/cpu/sharp/int_gemm.cpp | 187 ------ src/tensors/cpu/sharp/int_gemm.h | 36 - src/tensors/cpu/sharp/sse_gemm.cpp | 341 ---------- src/tensors/gpu/backend.h | 10 - src/tests/prod.cpp | 43 +- src/training/graph_group_async.cpp | 1 - src/training/graph_group_singleton.h | 1 - src/training/graph_group_sync.cpp | 1 - src/translator/translator.h | 8 - 39 files changed, 986 insertions(+), 1673 deletions(-) create mode 160000 src/3rd_party/intgemm create mode 100644 src/tensors/cpu/aligned.h create mode 100644 src/tensors/cpu/expression_graph_packable.h delete mode 100644 src/tensors/cpu/fbgemm/expression_graph_packable.h delete mode 100644 src/tensors/cpu/int16.h create mode 100644 src/tensors/cpu/integer_common.cpp create mode 100644 src/tensors/cpu/integer_common.h create mode 100644 src/tensors/cpu/intgemm_interface.h delete mode 100644 src/tensors/cpu/sharp/avx_gemm.cpp delete mode 100644 src/tensors/cpu/sharp/int_gemm.cpp delete mode 100644 src/tensors/cpu/sharp/int_gemm.h delete mode 100644 src/tensors/cpu/sharp/sse_gemm.cpp diff --git a/.gitmodules b/.gitmodules index 6cb63fc0b..a1a876d8b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,6 +14,9 @@ path = src/3rd_party/fbgemm url = https://github.com/marian-nmt/FBGEMM branch = master +[submodule "src/3rd_party/intgemm"] + path = src/3rd_party/intgemm + url = https://github.com/marian-nmt/intgemm/ [submodule "src/3rd_party/simple-websocket-server"] path = src/3rd_party/simple-websocket-server url = https://github.com/marian-nmt/Simple-WebSocket-Server diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e71c7daf..3ad291ee8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Added `intgemm8(ssse3|avx|avx512)?`, `intgemm16(sse2|avx|avx512)?` types to marian-conv with uses intgemm backend. 
Types intgemm8 and intgemm16 are hardware-agnostic, the other ones hardware-specific. +- Shortlist is now always multiple-of-eight. +- Added intgemm 8/16bit integer binary architecture agnostic format. - Add --train-embedder-rank for fine-tuning any encoder(-decoder) model for multi-lingual similarity via softmax-margin loss - Add --logical-epoch that allows to redefine the displayed epoch counter as a multiple of n data epochs, updates or labels. Also allows to define width of fractional part with second argument. - Add --metrics chrf for computing ChrF according to https://www.aclweb.org/anthology/W15-3049/ and SacreBLEU reference implementation @@ -56,6 +59,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fix the runtime failures for FASTOPT on 32-bit builds (wasm just happens to be 32-bit) because it uses hashing with an inconsistent mix of uint64_t and size_t. ### Changed +- Remove `--clip-gemm` which is obsolete and was never used anyway +- Removed `--optimize` switch, instead we now determine compute type based on binary model. - Updated SentencePiece repository to version 8336bbd0c1cfba02a879afe625bf1ddaf7cd93c5 from https://github.com/google/sentencepiece. - Enabled compilation of SentencePiece by default since no dependency on protobuf anymore. - Changed default value of --sentencepiece-max-lines from 10000000 to 2000000 since apparently the new version doesn't sample automatically anymore (Not quite clear how that affects quality of the vocabulary). diff --git a/CMakeLists.txt b/CMakeLists.txt index 343a3e3b0..aa43708c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -93,8 +93,8 @@ if(MSVC) # Or maybe use these? set(INTRINSICS "/arch:AVX2") # set(INTRINSICS "/arch:AVX512") - - set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS ${DISABLE_GLOBALLY}") + # /bigobj is necessary for expression_operators.cpp. See https://stackoverflow.com/questions/15110580/penalty-of-the-msvs-compiler-flag-bigobj + set(CMAKE_CXX_FLAGS "/EHsc /DWIN32 /D_WINDOWS /DUNICODE /D_UNICODE /D_CRT_NONSTDC_NO_WARNINGS /D_CRT_SECURE_NO_WARNINGS /bigobj ${DISABLE_GLOBALLY}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} /MT /O2 ${INTRINSICS} /Zi /MP /GL /DNDEBUG") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} /MTd /Od /Ob0 ${INTRINSICS} /RTC1 /Zi /D_DEBUG") @@ -438,6 +438,7 @@ endif(USE_MPI) ############################################################################### # Find BLAS library if(COMPILE_CPU) + set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU if(USE_APPLE_ACCELERATE) if(NOT APPLE) message(FATAL_ERROR "FATAL ERROR: Apple Accelerate only works on macOS.") diff --git a/regression-tests b/regression-tests index 16914ae94..97b2f95ab 160000 --- a/regression-tests +++ b/regression-tests @@ -1 +1 @@ -Subproject commit 16914ae94c80f338c678f0461c4e45965149f6aa +Subproject commit 97b2f95abab6134c1632b286e373e513ecc52020 diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 4013d2607..0b81e436c 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -8,6 +8,11 @@ add_subdirectory(./zlib) add_subdirectory(./faiss) include_directories(./faiss) +if(COMPILE_CPU) + set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests") + add_subdirectory(./intgemm) +endif(COMPILE_CPU) + if(USE_FBGEMM) # @TODO: find out if this is somehow harmful. 
This is supppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS # meant to silence CMakeFiles of 3rd_party tools. diff --git a/src/3rd_party/intgemm b/src/3rd_party/intgemm new file mode 160000 index 000000000..874ceebbf --- /dev/null +++ b/src/3rd_party/intgemm @@ -0,0 +1 @@ +Subproject commit 874ceebbf53a85691b326495100b6361a2166cec diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6dcf7fd89..64112ffe7 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,6 +5,8 @@ include_directories(3rd_party) include_directories(3rd_party/SQLiteCpp/include) include_directories(3rd_party/sentencepiece) include_directories(3rd_party/fbgemm/include) +include_directories(3rd_party/intgemm) +include_directories(${CMAKE_BINARY_DIR}/src/3rd_party/intgemm) # running cmake on the intgemm submodule triggers config file generation in this directory. include_directories(${CMAKE_BINARY_DIR}/local/include) set(MARIAN_SOURCES @@ -41,6 +43,7 @@ set(MARIAN_SOURCES 3rd_party/cnpy/cnpy.cpp 3rd_party/ExceptionWithCallStack.cpp + 3rd_party/onnx/protobuf/onnx-ml.pb-wrapper.cpp 3rd_party/phf/phf.cc @@ -52,10 +55,7 @@ set(MARIAN_SOURCES tensors/cpu/prod.cpp tensors/cpu/topk.cpp tensors/cpu/tensor_operators.cpp - - tensors/cpu/sharp/int_gemm.cpp - tensors/cpu/sharp/avx_gemm.cpp - tensors/cpu/sharp/sse_gemm.cpp + tensors/cpu/integer_common.cpp tensors/cpu/fbgemm/packed_gemm.cpp graph/expression_graph.cpp diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp index 9bdcfb405..26cac858f 100644 --- a/src/command/marian_conv.cpp +++ b/src/command/marian_conv.cpp @@ -1,12 +1,10 @@ #include "marian.h" - #include "common/cli_wrapper.h" +#include "tensors/cpu/expression_graph_packable.h" +#include "onnx/expression_graph_onnx_exporter.h" #include -#include "tensors/cpu/fbgemm/expression_graph_packable.h" -#include "onnx/expression_graph_onnx_exporter.h" - int main(int argc, char** argv) { using namespace marian; @@ -24,7 +22,9 @@ int main(int argc, char** argv) { cli->add("--from,-f", "Input model", "model.npz"); cli->add("--to,-t", "Output model", "model.bin"); cli->add("--export-as", "Kind of conversion: marian-bin or onnx-{encode,decoder-step,decoder-init,decoder-stop}", "marian-bin"); - cli->add("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512", "float32"); + cli->add("--gemm-type,-g", "GEMM Type to be used: float32, packed16, packed8avx2, packed8avx512, " + "intgemm8, intgemm8ssse3, intgemm8avx2, intgemm8avx512, intgemm16, intgemm16sse2, intgemm16avx2, intgemm16avx512", + "float32"); cli->add>("--vocabs,-V", "Vocabulary file, required for ONNX export"); cli->parse(argc, argv); options->merge(config); @@ -35,19 +35,8 @@ int main(int argc, char** argv) { auto exportAs = options->get("export-as"); auto vocabPaths = options->get>("vocabs");// , std::vector()); - auto saveGemmTypeStr = options->get("gemm-type", "float32"); - Type saveGemmType; - if(saveGemmTypeStr == "float32") { - saveGemmType = Type::float32; - } else if(saveGemmTypeStr == "packed16") { // packed16 only supports AVX2. 
AVX512 might be added later - saveGemmType = Type::packed16; - } else if(saveGemmTypeStr == "packed8avx2") { // packed8 for AVX2 - saveGemmType = Type::packed8avx2; - } else if(saveGemmTypeStr == "packed8avx512") { // packed8 for AVX512 - saveGemmType = Type::packed8avx512; - } else { - ABORT("Unknown gemm-type: {}", saveGemmTypeStr); - } + // We accept any type here and will later croak during packAndSave if the type cannot be used for conversion + Type saveGemmType = typeFromString(options->get("gemm-type", "float32")); LOG(info, "Outputting {}, precision: {}", modelTo, saveGemmType); @@ -58,12 +47,11 @@ int main(int argc, char** argv) { auto load = [&](Ptr graph) { graph->setDevice(CPU0); - graph->getBackend()->setOptimized(false); - graph->load(modelFrom); graph->forward(); // run the initializers }; + if (exportAs == "marian-bin") { auto graph = New(); load(graph); diff --git a/src/common/binary.cpp b/src/common/binary.cpp index 1531fed6c..60f651f2d 100644 --- a/src/common/binary.cpp +++ b/src/common/binary.cpp @@ -3,6 +3,7 @@ #include "common/file_stream.h" #include "common/io_item.h" #include "common/types.h" +#include "tensors/cpu/integer_common.h" #include @@ -57,13 +58,31 @@ void loadItems(const void* current, std::vector& items, bool mapped) { get(current, offset); for(int i = 0; i < numHeaders; ++i) { + // For intgemm AVX512 and AVX512VNNI have the same arangement, but the VNNI algorithm is faster. + // Change the type to the fastest one supported. + if (items[i].type == Type::intgemm8avx512) { + items[i].type = cpu::integer::getIntgemmType(Type::intgemm8); + } if(items[i].mapped) { // memory-mapped, hence only set pointer + // @TOOD: verify this actually works for the hardware-specific ones like intgemm8avx2 + ABORT_IF(items[i].type == Type::intgemm8 || items[i].type == Type::intgemm16, "mmap format not supported for hardware non-specific intgemm matrices"); items[i].ptr = get(current, headers[i].dataLength); } else { // reading into item data size_t len = headers[i].dataLength; items[i].bytes.resize(len); const char* ptr = get(current, len); - std::copy(ptr, ptr + len, items[i].bytes.begin()); + // Intgemm8/16 matrices in binary model are just quantized, however they also need to be reordered + // Reordering depends on the architecture (SSE/AVX2/AVX512) so we read in the quantized matrices and + // then reorder them before adding them as a parameter in the graph. + if (matchType(items[i].type)) { + items[i].type = cpu::integer::getIntgemmType(Type::intgemm8); + cpu::integer::prepareAndTransposeB(items[i], ptr); + } else if (matchType(items[i].type)) { + items[i].type = cpu::integer::getIntgemmType(Type::intgemm16); + cpu::integer::prepareAndTransposeB(items[i], ptr); + } else { + std::copy(ptr, ptr + len, items[i].bytes.begin()); + } } } } diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index f72475f64..1c2de918f 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -134,8 +134,6 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { "Suppress logging for translation"); cli.add("--seed", "Seed for all random number generators. 
0 means initialize randomly"); - cli.add("--clip-gemm", - "If not 0 clip GEMM input values to +/- arg"); cli.add("--interpolate-env-vars", "allow the use of environment variables in paths, of the form ${VAR_NAME}"); cli.add("--relative-paths", @@ -671,15 +669,13 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { addSuboptionsDevices(cli); addSuboptionsBatching(cli); - cli.add("--optimize", - "Optimize speed aggressively sacrificing memory or precision"); - cli.add("--skip-cost", - "Ignore model cost during translation, not recommended for beam-size > 1"); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", "Mixed precision for inference, set parameter type in expression graph", {"float32"}); + cli.add("--skip-cost", + "Ignore model cost during translation, not recommended for beam-size > 1"); cli.add>("--shortlist", "Use softmax shortlist: path first best prune"); @@ -737,8 +733,6 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { addSuboptionsDevices(cli); addSuboptionsBatching(cli); - cli.add("--optimize", - "Optimize speed aggressively sacrificing memory or precision"); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", @@ -776,12 +770,10 @@ void ConfigParser::addOptionsEmbedding(cli::CLIWrapper& cli) { addSuboptionsDevices(cli); addSuboptionsBatching(cli); - cli.add("--optimize", - "Optimize speed aggressively sacrificing memory or precision"); cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); cli.add>("--precision", - "Mixed precision for inference, set parameter type in expression graph", + "Mixed precision for inference, set parameter type in expression graph. Supported values: float32, float16", {"float32"}); cli.switchGroup(previous_group); @@ -934,7 +926,6 @@ void ConfigParser::addSuboptionsQuantization(cli::CLIWrapper& cli) { // clang-format on } - cli::mode ConfigParser::getMode() const { return mode_; } Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) { diff --git a/src/common/types.cpp b/src/common/types.cpp index f358cdb62..76cc13f06 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -26,13 +26,16 @@ size_t requiredBytes(const Shape& shape, Type type) { ABORT("Not a supported data type: {}", type); return 0; } + } +#endif // USE_FBGEMM + + if (isIntgemm(type)) { + /* Intgemm tensors have an extra float at the back that stores the quantization multiplier */ + return shape.elements() * sizeOf(type) + sizeOf(Type::float32); } else { return shape.elements() * sizeOf(type); } -#else - return shape.elements() * sizeOf(type); -#endif // USE_FBGEMM } -} \ No newline at end of file +} // namespace marian \ No newline at end of file diff --git a/src/common/types.h b/src/common/types.h index 4bc4f9ad0..0f70bb228 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -135,20 +135,26 @@ do { \ namespace marian { // small struct to enable templating based on types use for packing -struct packed16 { - uint16_t x; -}; +struct packed16 { uint16_t x; }; // small struct to enable templating based on types use for packing. This is a memory holder. // There's no difference between packed8avx2 and packed8avx512. But, they are separately defined to be distinguished. 
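For orientation, the TypeClass/Type hunks below encode each tensor type as a set of class bits plus the element size in bytes; the new intgemm entries add an `intgemm_type` class bit and, for the hardware-specific variants, an ISA bit. A minimal standalone sketch of how those bits compose and decompose, with the relevant constants re-declared here purely for illustration (they live in the enums shown in the diff, not in this snippet):

```cpp
// Illustration only: these constants mirror the TypeClass values in the diff below.
#include <cassert>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t intgemm_type = 0x10000, avx2_type = 0x01000, sse2_type = 0x04000;
  const std::size_t size_mask = 0x000FF, class_mask = 0xFFF00;

  // Type::intgemm8avx2  = intgemm_type + 1u + avx2_type (1-byte elements, AVX2 layout)
  // Type::intgemm16sse2 = intgemm_type + 2u + sse2_type (2-byte elements, SSE2 layout)
  const std::size_t intgemm8avx2  = intgemm_type + 1u + avx2_type;
  const std::size_t intgemm16sse2 = intgemm_type + 2u + sse2_type;

  assert((intgemm8avx2 & size_mask) == 1);     // what sizeOf() reads
  assert((intgemm8avx2 & intgemm_type) != 0);  // what isIntgemm() checks
  assert((intgemm8avx2 & avx2_type) != 0);     // what isAvx2() checks
  assert((intgemm16sse2 & size_mask) == 2);
  assert((intgemm16sse2 & sse2_type) != 0);    // what isSse2() checks
  assert((intgemm8avx2 & class_mask) != (intgemm16sse2 & class_mask));

  std::printf("intgemm8avx2 = 0x%zx, intgemm16sse2 = 0x%zx\n", intgemm8avx2, intgemm16sse2);
  return 0;
}
```

The plain `intgemm8`/`intgemm16` values carry no ISA bit; as the binary.cpp hunk above shows, they are resolved to the fastest supported layout at load time via `cpu::integer::getIntgemmType()`.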
-struct packed8avx2 { - uint8_t x; -}; +struct packed8avx2 { uint8_t x; }; +struct packed8avx512 { uint8_t x; }; + +// similar to the packed16, but to use with 16bit intgemm model packing. +struct intgemm16 { int16_t x; }; +struct intgemm16sse2 { int16_t x; }; +struct intgemm16avx2 { int16_t x; }; +struct intgemm16avx512 { int16_t x; }; + +// similar to packed8* but for intgemm 8bit model packing. +struct intgemm8 { int8_t x; }; +struct intgemm8ssse3 { int8_t x; }; +struct intgemm8avx2 { int8_t x; }; +struct intgemm8avx512 { int8_t x; }; +struct intgemm8avx512vnni { int8_t x; }; -// small struct to enable templating based on types use for packing. This is a memory holder. -struct packed8avx512 { - uint8_t x; -}; #ifndef __CUDACC__ // vectorized types not available from .cu files @@ -214,17 +220,22 @@ struct float32x8 { #endif // Internal to types.h, don't use. Use test functions below. -enum class TypeClass : size_t { - signed_type = 0x0100, - unsigned_type = 0x0200, - float_type = 0x0400, - - packed_type = 0x0800, // special packed (CPU cache friendly) type class, used in FBGEMM, not meant to be used anywhere else - avx2_type = 0x1000, // processor-specific layout for avx2, currently used for FBGEMM only - avx512_type = 0x2000, // processor-specific layout for avx512, currently used for FBGEMM only - - size_mask = 0x00FF, - class_mask = 0xFF00 +enum class TypeClass : size_t { // size_type has 8 bytes, so we can have 16 fields here, currently using 5. Extend to the left for back-compat. + // built-in type classes + signed_type = 0x00100, + unsigned_type = 0x00200, + float_type = 0x00400, + + avx2_type = 0x01000, // processor-specific layout for avx2, currently used for FBGEMM only (keep 0x1000 for back-compat) + avx512_type = 0x02000, // processor-specific layout for avx512, currently used for FBGEMM only (keep 0x2000 for back-compat) + sse2_type = 0x04000, // processor-specific layout for sse2, currently used for Intgemm only + ssse3_type = 0x08000, // processor-specific layout for ssse3, currently used for Intgemm only + + packed_type = 0x00800, // special packed (CPU cache friendly) type class, used in FBGEMM. Annoyingly we need to keep 0x800 for back-compat, would be nicer to align with intgemm + intgemm_type = 0x10000, // intgemm quantized architecture agnostic models + + size_mask = 0x000FF, // maximum allowed size is 256 bytes right now; if more are required, extend the size field + class_mask = 0xFFF00, // three fields for different type classes, if more classes are added we need to increase the number of fields here }; constexpr inline size_t operator+(TypeClass typeClass, size_t val) { @@ -251,10 +262,21 @@ enum class Type : size_t { float32 = TypeClass::float_type + 4u, float64 = TypeClass::float_type + 8u, - packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless. - packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. - packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. + packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. 
Internal actual type (uint16) is meaningless. + packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. + packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless. + + intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm + intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm + intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm + intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm + intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm + intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm + + intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm + intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm + intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm }; static inline size_t operator&(TypeClass typeClass, Type type) { @@ -289,6 +311,14 @@ static inline bool isPacked(Type type) { return (TypeClass::packed_type & type) != 0; } +static inline bool isSse2(Type type) { + return (TypeClass::sse2_type & type) != 0; +} + +static inline bool isSsse3(Type type) { + return (TypeClass::ssse3_type & type) != 0; +} + static inline bool isAvx2(Type type) { return (TypeClass::avx2_type & type) != 0; } @@ -297,6 +327,10 @@ static inline bool isAvx512(Type type) { return (TypeClass::avx512_type & type) != 0; } +static inline bool isIntgemm(Type type) { + return (TypeClass::intgemm_type & type) != 0; +} + size_t requiredBytes(const Shape& shape, Type type); // towards Frank's vision of joint Shape/Type template @@ -314,13 +348,24 @@ template <> inline bool matchType(Type type) { return type == Type::ui template <> inline bool matchType(Type type) { return type == Type::uint32; } template <> inline bool matchType(Type type) { return type == Type::uint64; } -template <> inline bool matchType(Type type) { return type == Type::float16; } -template <> inline bool matchType(Type type) { return type == Type::float32; } -template <> inline bool matchType(Type type) { return type == Type::float64; } +template <> inline bool matchType(Type type) { return type == Type::float16; } +template <> inline bool matchType(Type type) { return type == Type::float32; } +template <> inline bool matchType(Type type) { return type == Type::float64; } + +template <> inline bool matchType(Type type) { return type == Type::packed16; } +template <> inline bool matchType(Type type) { return type == Type::packed8avx2; } +template <> inline bool matchType(Type type) { return type == Type::packed8avx512; } -template <> inline bool matchType(Type type) { return type == Type::packed16; } -template <> inline bool matchType(Type type) { return type == Type::packed8avx2; } -template 
<> inline bool matchType(Type type) { return type == Type::packed8avx512; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8ssse3; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8avx2; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8avx512; } +template <> inline bool matchType(Type type) { return type == Type::intgemm8avx512vnni; } + +template <> inline bool matchType(Type type) { return type == Type::intgemm16; } +template <> inline bool matchType(Type type) { return type == Type::intgemm16sse2; } +template <> inline bool matchType(Type type) { return type == Type::intgemm16avx2; } +template <> inline bool matchType(Type type) { return type == Type::intgemm16avx512; } // clang-format on static inline std::ostream& operator<<(std::ostream& out, Type type) { @@ -342,6 +387,16 @@ static inline std::ostream& operator<<(std::ostream& out, Type type) { case Type::packed16 : out << "packed16"; break; case Type::packed8avx2 : out << "packed8avx2"; break; case Type::packed8avx512 : out << "packed8avx512"; break; + + case Type::intgemm8 : out << "intgemm8"; break; + case Type::intgemm8ssse3 : out << "intgemm8ssse3"; break; + case Type::intgemm8avx2 : out << "intgemm8avx2"; break; + case Type::intgemm8avx512 : out << "intgemm8avx512"; break; + case Type::intgemm8avx512vnni : out << "intgemm8avx512vnni"; break; + case Type::intgemm16 : out << "intgemm16"; break; + case Type::intgemm16sse2 : out << "intgemm16sse2"; break; + case Type::intgemm16avx2 : out << "intgemm16avx2"; break; + case Type::intgemm16avx512 : out << "intgemm16avx512"; break; } return out; } @@ -350,12 +405,12 @@ template inline std::string request(); // clang-format off -template <> inline std::string request() { return "int8"; } +template <> inline std::string request() { return "int8"; } template <> inline std::string request() { return "int16"; } template <> inline std::string request() { return "int32"; } template <> inline std::string request() { return "int64"; } -template <> inline std::string request() { return "uint8"; } +template <> inline std::string request() { return "uint8"; } template <> inline std::string request() { return "uint16"; } template <> inline std::string request() { return "uint32"; } template <> inline std::string request() { return "uint64"; } @@ -364,9 +419,19 @@ template <> inline std::string request() { return "float16"; } template <> inline std::string request() { return "float32"; } template <> inline std::string request() { return "float64"; } -template <> inline std::string request() { return "packed16"; } -template <> inline std::string request() { return "packed8avx2"; } -template <> inline std::string request() { return "packed8avx512"; } +template <> inline std::string request() { return "packed16"; } +template <> inline std::string request() { return "packed8avx2"; } +template <> inline std::string request() { return "packed8avx512"; } + +template <> inline std::string request() { return "intgemm8"; } +template <> inline std::string request() { return "intgemm8ssse3"; } +template <> inline std::string request() { return "intgemm8avx2"; } +template <> inline std::string request() { return "intgemm8avx512"; } +template <> inline std::string request() { return "intgemm8avx512vnni"; } +template <> inline std::string request() { return "intgemm16"; } +template <> inline std::string request() { return "intgemm16sse2"; } 
+template <> inline std::string request() { return "intgemm16avx2"; } +template <> inline std::string request() { return "intgemm16avx512"; } // clang-format on static Type inline typeFromString(const std::string& str) { @@ -402,18 +467,38 @@ static Type inline typeFromString(const std::string& str) { if(str == "packed8avx512") return Type::packed8avx512; + if(str == "intgemm8") + return Type::intgemm8; + if(str == "intgemm8ssse3") + return Type::intgemm8ssse3; + if(str == "intgemm8avx2") + return Type::intgemm8avx2; + if(str == "intgemm8avx512") + return Type::intgemm8avx512; + if(str == "intgemm8avx512vnni") + return Type::intgemm8avx512vnni; + + if(str == "intgemm16") + return Type::intgemm16; + if(str == "intgemm16sse2") + return Type::intgemm16sse2; + if(str == "intgemm16avx2") + return Type::intgemm16avx2; + if(str == "intgemm16avx512") + return Type::intgemm16avx512; + ABORT("Unknown type {}", str); } template inline Type typeId(); -template <> inline Type typeId() { return Type::int8; } +template <> inline Type typeId() { return Type::int8; } template <> inline Type typeId() { return Type::int16; } template <> inline Type typeId() { return Type::int32; } template <> inline Type typeId() { return Type::int64; } -template <> inline Type typeId() { return Type::uint8; } +template <> inline Type typeId() { return Type::uint8; } template <> inline Type typeId() { return Type::uint16; } template <> inline Type typeId() { return Type::uint32; } template <> inline Type typeId() { return Type::uint64; } @@ -422,10 +507,21 @@ template <> inline Type typeId() { return Type::float16; } template <> inline Type typeId() { return Type::float32; } template <> inline Type typeId() { return Type::float64; } -template <> inline Type typeId() { return Type::packed16; } -template <> inline Type typeId() { return Type::packed8avx2; } +template <> inline Type typeId() { return Type::packed16; } +template <> inline Type typeId() { return Type::packed8avx2; } template <> inline Type typeId() { return Type::packed8avx512; } +template <> inline Type typeId() { return Type::intgemm8; } +template <> inline Type typeId() { return Type::intgemm8ssse3; } +template <> inline Type typeId() { return Type::intgemm8avx2; } +template <> inline Type typeId() { return Type::intgemm8avx512; } +template <> inline Type typeId() { return Type::intgemm8avx512vnni; } +template <> inline Type typeId() { return Type::intgemm16; } +template <> inline Type typeId() { return Type::intgemm16sse2; } +template <> inline Type typeId() { return Type::intgemm16avx2; } +template <> inline Type typeId() { return Type::intgemm16avx512; } + + // Abort if given C++ does not correspond to runtime type template void matchOrAbort(Type type) { diff --git a/src/data/shortlist.h b/src/data/shortlist.h index 78cbccea2..395bcfee7 100644 --- a/src/data/shortlist.h +++ b/src/data/shortlist.h @@ -260,6 +260,14 @@ class LexicalShortlistGenerator : public ShortlistGenerator { for(auto& it : data_[i]) indexSet.insert(it.first); } + // Ensure that the generated vocabulary items from a shortlist are a multiple-of-eight + // This is necessary until intgemm supports non-multiple-of-eight matrices. + // TODO better solution here? This could potentially be slow. 
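The padding loop added just below can be exercised on its own; a small self-contained sketch of the same idea, using a plain std::set and starting the filler ids at 0 where the generator would use its firstNum_ setting:

```cpp
#include <cassert>
#include <cstdint>
#include <set>

int main() {
  std::set<uint32_t> indexSet = {5, 17, 42, 100, 101};  // five shortlisted word ids
  // Pad with the lowest ids until the count is a multiple of eight, mirroring the loop
  // added below; inserting an id that is already present leaves the size unchanged,
  // so the loop simply advances to the next candidate.
  uint32_t i = 0;  // stands in for firstNum_ in the shortlist generator
  while (indexSet.size() % 8 != 0) {
    indexSet.insert(i);
    ++i;
  }
  assert(indexSet.size() % 8 == 0);  // five entries padded up to eight here
  return 0;
}
```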
+ WordIndex i = static_cast(firstNum_); + while (indexSet.size() % 8 != 0) { + indexSet.insert(i); + i++; + } // turn into vector and sort (selected indices) std::vector indices(indexSet.begin(), indexSet.end()); diff --git a/src/embedder/embedder.h b/src/embedder/embedder.h index 65807a51b..f2e4a10c0 100644 --- a/src/embedder/embedder.h +++ b/src/embedder/embedder.h @@ -84,11 +84,6 @@ class Embed : public ModelTask { auto precison = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph graph->setDevice(device); - graph->getBackend()->setClip(options_->get("clip-gemm")); - if (device.type == DeviceType::cpu) { - graph->getBackend()->setOptimized(options_->get("optimize")); - } - graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); } diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index f571f9f8b..3d42c6002 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -7,7 +7,7 @@ #include "graph/node_operators_tuple.h" #include "graph/auto_tuner.h" -#include "tensors/cpu/int16.h" +#include "tensors/cpu/intgemm_interface.h" #include "tensors/cpu/fbgemm/expanded_gemm.h" #if USE_FBGEMM @@ -466,7 +466,6 @@ Expr weighted_average(Expr in, Expr weights, int ax) { Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { auto device = a->graph()->getDeviceId().type; - float clipValue = a->graph()->getBackend()->getClip(); // added support for packed GEMM API (fp16, int8) Type aElementType = a->value_type(); Type bElementType = b->value_type(); @@ -475,18 +474,9 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { // --optimize --cpu-thread=N with N > 0 are set. if(device == DeviceType::cpu) { if(isFloat(aElementType) && isFloat(bElementType)) { - if(a->graph()->getBackend()->isOptimized()) { - // dotInt16 computes A * B.T, hence the transpose for B to get A * B - // if transA = false and transB = false. - - return cpu::int16::dot( - cpu::int16::quantize(transA ? transpose(a) : a, clipValue), - cpu::int16::quantize(transB ? b : transpose(b), clipValue), - scale); - } else { - return Expression( - clip(a, clipValue), clip(b, clipValue), transA, transB, scale); - } + return Expression(a, b, transA, transB, scale); + } else if(isFloat(aElementType) && isIntgemm(bElementType)) { + return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale); } else if(isFloat(aElementType) && isPacked(bElementType)) { #if USE_FBGEMM // 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2 @@ -496,7 +486,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { // and this cpu lookup is executed only once and the state is kept in FBGEMM. if(fbgemm::fbgemmHasAvx2Support()) { // This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B). 
- return cpu::variant::dot(clip(a, clipValue), + return cpu::variant::dot(a, b, b->shape(), transA, @@ -512,8 +502,7 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) { ABORT("Combination of types A: {} B: {} not supported", aElementType, bElementType); } } else { - return Expression( - clip(a, clipValue), clip(b, clipValue), transA, transB, scale); + return Expression(a, b, transA, transB, scale); } } @@ -524,16 +513,9 @@ Expr bdot(Expr a, Expr b, bool transA, bool transB, float scale) { static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // general version, MKL, CBlas or CUDA - // if clipValue > 0, the inputs will be clipped to range [-clipValue, - // clipValue] This is meant to keep values at the same range as used during - // training when optimizing for 8-bit integer products. Likely to be removed - // in the future when we explore better ways to handle this. - float clipValue = a->graph()->getBackend()->getClip(); - int rows = a->shape().elements() / a->shape()[-1]; Expr ones = a->graph()->ones({ rows, 1 }); - std::vector nodes - = { clip(a, clipValue), clip(b, clipValue), bias, ones }; + std::vector nodes = { a, b, bias, ones }; return Expression(nodes, transA, transB, scale); } @@ -545,22 +527,14 @@ static Expr affineDefault(Expr a, Expr b, Expr bias, bool transA, bool transB, f Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { auto device = a->graph()->getDeviceId().type; - float clipValue = a->graph()->getBackend()->getClip(); Type aElementType = a->value_type(); Type bElementType = b->value_type(); if(device == DeviceType::cpu) { if(isFloat(aElementType) && isFloat(bElementType)) { - if(a->graph()->getBackend()->isOptimized()) { - // cpu int16 version - return cpu::int16::affine( - cpu::int16::quantize(transA ? transpose(a) : a, clipValue), - cpu::int16::quantize(transB ? b : transpose(b), clipValue), - bias, - scale); - } else { - return affineDefault(a, b, bias, transA, transB, scale); - } + return affineDefault(a, b, bias, transA, transB, scale); + } else if(isFloat(aElementType) && isIntgemm(bElementType)) { + return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale); } else if(isFloat(aElementType) && isPacked(bElementType)) { #if USE_FBGEMM // 07/10/2019 - Use packed GEMM only if the cpu architecture supports AVX2 @@ -570,7 +544,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) { // and this cpu lookup is executed only once and the state is kept in FBGEMM. if(fbgemm::fbgemmHasAvx2Support()) { // This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B). 
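For context on the `cpu::integer::affineOrDot` calls introduced in the two hunks above: the wrapper lives in the new tensors/cpu/intgemm_interface.h (not shown in this excerpt) and ultimately follows intgemm's usual prepare/multiply flow, where B is quantized and reordered ahead of time, A is quantized on the fly, and the integer product is unquantized back to float. Below is a rough standalone sketch against intgemm's public API; the include path, the 64-byte alignment requirement and the dimension restrictions are assumptions based on intgemm's documentation rather than on this patch:

```cpp
// Rough, POSIX-only sketch (posix_memalign) of intgemm's prepare/multiply flow.
#include <cstdint>
#include <cstdlib>
#include "intgemm/intgemm.h"  // shipped as src/3rd_party/intgemm in this repo; adjust as needed

template <class T> T* alignedAlloc(std::size_t n) {  // intgemm expects 64-byte aligned buffers
  void* p = nullptr;
  if (posix_memalign(&p, 64, n * sizeof(T))) std::abort();
  return static_cast<T*>(p);
}

int main() {
  // Illustrative sizes: the shared dimension and B's column count have to respect
  // intgemm's tiling restrictions (multiples of 64 and 8 are safe for all kernels),
  // which is also why the shortlist is padded to a multiple of eight elsewhere in this patch.
  const intgemm::Index A_rows = 8, width = 64, B_cols = 8;

  float* A = alignedAlloc<float>(A_rows * width);
  float* B = alignedAlloc<float>(width * B_cols);
  float* C = alignedAlloc<float>(A_rows * B_cols);
  for (std::size_t i = 0; i < A_rows * width; ++i) A[i] = 0.25f;
  for (std::size_t i = 0; i < width * B_cols; ++i) B[i] = -0.5f;

  // Illustrative multipliers (127 over an assumed maximum absolute value of 1.0);
  // marian computes B's multiplier offline and stores it alongside the weights.
  const float quantMultA = 127.0f, quantMultB = 127.0f;

  int8_t* A8 = alignedAlloc<int8_t>(A_rows * width);
  int8_t* B8 = alignedAlloc<int8_t>(width * B_cols);
  intgemm::Int8::PrepareA(A, A8, quantMultA, A_rows, width);
  intgemm::Int8::PrepareB(B, B8, quantMultB, width, B_cols);
  intgemm::Int8::Multiply(A8, B8, A_rows, width, B_cols,
      intgemm::callbacks::UnquantizeAndWrite(1.0f / (quantMultA * quantMultB), C));

  std::free(A); std::free(B); std::free(C); std::free(A8); std::free(B8);
  return 0;
}
```

At inference time marian does not recompute B's multiplier; it reads the extra float stored after the tensor data (see the requiredBytes() change above and getQuantMult() in integer_common.h below).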
- return cpu::variant::affine(clip(a, clipValue), + return cpu::variant::affine(a, b, b->shape(), bias, diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp index 827c89828..4ecdf1697 100755 --- a/src/microsoft/quicksand.cpp +++ b/src/microsoft/quicksand.cpp @@ -10,7 +10,7 @@ #include "translator/scorers.h" #include "data/alignment.h" #include "data/vocab_base.h" -#include "tensors/cpu/fbgemm/expression_graph_packable.h" +#include "tensors/cpu/expression_graph_packable.h" #if USE_FBGEMM #include "fbgemm/Utils.h" @@ -256,7 +256,6 @@ bool convertModel(std::string inputFile, std::string outputFile, int32_t targetP auto graph = New(); graph->setDevice(CPU0); - graph->getBackend()->setOptimized(false); graph->load(inputFile); graph->forward(); diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h index 65644ccce..af0a16066 100644 --- a/src/rescorer/rescorer.h +++ b/src/rescorer/rescorer.h @@ -73,11 +73,6 @@ class Rescore : public ModelTask { auto precison = options_->get>("precision", {"float32"}); graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph graph->setDevice(device); - graph->getBackend()->setClip(options_->get("clip-gemm")); - if (device.type == DeviceType::cpu) { - graph->getBackend()->setOptimized(options_->get("optimize")); - } - graph->reserveWorkspaceMB(options_->get("workspace")); graphs_.push_back(graph); } diff --git a/src/tensors/backend.h b/src/tensors/backend.h index ce8a5e603..160b828d3 100644 --- a/src/tensors/backend.h +++ b/src/tensors/backend.h @@ -10,10 +10,7 @@ class Backend { DeviceId deviceId_; size_t seed_; Ptr randomGenerator_; - - // global clipping value for matrix-multiplies, should soon be removed. - float clipValue_{0.f}; - + public: Backend(DeviceId deviceId, size_t seed) : deviceId_(deviceId), seed_(seed), randomGenerator_(createRandomGenerator(seed, deviceId)) {} @@ -24,14 +21,6 @@ class Backend { // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name. virtual void setDevice() = 0; virtual void synchronize() = 0; - - virtual void setClip(float clipValue) { clipValue_ = clipValue; } - float getClip() { return clipValue_; } - - // for CPU, sets to use optimized code for inference. - // for GPU, this is invalid. for gpu, isOptimized() function always returns false. - virtual void setOptimized(bool optimize) = 0; - virtual bool isOptimized() = 0; }; Ptr BackendByDeviceId(DeviceId deviceId, size_t seed); diff --git a/src/tensors/cpu/aligned.h b/src/tensors/cpu/aligned.h new file mode 100644 index 000000000..969e08ae9 --- /dev/null +++ b/src/tensors/cpu/aligned.h @@ -0,0 +1,43 @@ +#pragma once + +#include "common/definitions.h" +#include +#ifdef _WIN32 +#include +#endif + +namespace marian { +namespace cpu { +namespace { + +// allocate function for tensor reserve() below. +// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory. + +#ifdef _WIN32 +void *genericMalloc(size_t alignment, size_t size) { + void *ret = _aligned_malloc(size, alignment); + ABORT_IF(!ret, "Failed to allocate memory on CPU"); + return ret; +} +void genericFree(void *ptr) { + _aligned_free(ptr); +} +#else +// Linux and OS X. There is no fallback to malloc because we need it to be aligned. +void *genericMalloc(size_t alignment, size_t size) { + // On macos, aligned_alloc is available only on c++17 + // Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails. 
+ // posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang + void *result; + // Error could be detected by return value or just remaining nullptr. + ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU"); + return result; +} +void genericFree(void *ptr) { + free(ptr); +} +#endif + +} +} // namespace cpu +} // namespace marian diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 74bbf8082..398e24240 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -10,17 +10,11 @@ namespace marian { namespace cpu { class Backend : public marian::Backend { -protected: - bool optimized_{false}; - public: Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {} void setDevice() override {} void synchronize() override {} - - // for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU. - void setOptimized(bool optimize) override { optimized_ = optimize; } - bool isOptimized() override { return optimized_; } }; + } // namespace cpu } // namespace marian diff --git a/src/tensors/cpu/device.cpp b/src/tensors/cpu/device.cpp index 40bb558b1..fcc63b6a5 100644 --- a/src/tensors/cpu/device.cpp +++ b/src/tensors/cpu/device.cpp @@ -1,44 +1,8 @@ #include "tensors/device.h" +#include "tensors/cpu/aligned.h" #include - -#ifdef _WIN32 -#include -#endif -#include - namespace marian { namespace cpu { -namespace { - -// allocate function for tensor reserve() below. -// Alignment is needed because we use AVX512 and AVX2 vectors. We should fail if we can't allocate aligned memory. - -#ifdef _WIN32 -void *genericMalloc(size_t alignment, size_t size) { - void *ret = _aligned_malloc(size, alignment); - ABORT_IF(!ret, "Failed to allocate memory on CPU"); - return ret; -} -void genericFree(void *ptr) { - _aligned_free(ptr); -} -#else -// Linux and OS X. There is no fallback to malloc because we need it to be aligned. -void *genericMalloc(size_t alignment, size_t size) { - // On macos, aligned_alloc is available only on c++17 - // Furthermore, it requires that the memory requested is an exact multiple of the alignment, otherwise it fails. - // posix_memalign is available both Mac (Since 2016) and Linux and in both gcc and clang - void *result; - // Error could be detected by return value or just remaining nullptr. - ABORT_IF(posix_memalign(&result, alignment, size), "Failed to allocate memory on CPU"); - return result; -} -void genericFree(void *ptr) { - free(ptr); -} -#endif - -} // namespace Device::~Device() { genericFree(data_); diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h new file mode 100644 index 000000000..689aa3b18 --- /dev/null +++ b/src/tensors/cpu/expression_graph_packable.h @@ -0,0 +1,266 @@ +#pragma once + +#include "graph/expression_graph.h" +#include "fbgemm/packed_gemm.h" +#include "tensors/cpu/integer_common.h" + +namespace marian { + namespace cpu { + void Transpose10(marian::Tensor out, const marian::Tensor in); + } +} + +namespace marian { + + +// When FBGEMM based packed GEMM is used, some weight matrices need to be packed offline. +// The decision which weights can be packed or not should be done walking through the graph. +// This requires some more changes, but we temporarily do this just by name ("_W") of the weights. +// And, this introduces a low level packed_gemm.h apis interact with high level graph class. 
+// So, we make a subclass of ExpressionGraph and put those immature codes in this class. +// We will improve this in the near future. +class ExpressionGraphPackable : public ExpressionGraph { +public: + ExpressionGraphPackable() + : ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference + + virtual ~ExpressionGraphPackable() {} + + // Convert model weights into packed format and save to IO items. + // @TODO: review this + void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) { + std::vector ioItems; + + // sorted by name in std::map + for (auto p : params()->getMap()) { + std::string pName = p.first; + + if (!namespace_.empty()) { + if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::") + pName = pName.substr(namespace_.size() + 2); + } + + Tensor val = p.second->val(); + + // save as packed format + // @TODO Hardcoded to find packable weights + // int8 - all the weights used for affine op and dot op + // fp16 - all the weights used for affine op + if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512) + && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) { +#if USE_FBGEMM + using namespace marian::cpu::variant; + // packing information - size + int nrow; + int ncol; + uint64_t packsize; + + fbgemmPacked8PackInfo(val->shape(), + gemmElementType, + pName.find("Wemb") != std::string::npos, + nrow, + ncol, + packsize); + + auto allocator = New(getBackend()); + + // buffer tensor to save packed matrix + Tensor packedTensor; + allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); + + //Pack B matrix into int8 + fbgemmPacked8Pack(packedTensor, + val->data(), + gemmElementType, + pName.find("Wemb") != std::string::npos, + nrow, + ncol, + packsize); + io::Item item; + item.name = pName; + item.shape = val->shape(); + item.type = gemmElementType; + + // Use the actual memory as this will be aligned and padded. + // When memory mapping this is required. Shape keeps track of + // tensor size. Saving to *.npz will cut to size. + auto mem = packedTensor->memory(); + item.bytes.resize(mem->size()); + copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); + + ioItems.emplace_back(std::move(item)); +#else + ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); +#endif + // fp16 quantization option + } else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) { +#if USE_FBGEMM + using namespace marian::cpu::variant; + + // packing information + int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol; + uint64_t packsize; + + fbgemmPacked16PackInfo(val->shape(), + false, + nrow, + ncol, + kernel_ncol_blocks, + brow, + bcol, + last_brow, + nbrow, + nbcol, + packsize); + + auto allocator = New(getBackend()); + + Tensor packedTensor; + allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); + + // fbgemmPacked16Pack + fbgemmPacked16Pack(packedTensor, + val->data(), + false, + nrow, + ncol, + kernel_ncol_blocks, + brow, + bcol, + last_brow, + nbrow, + nbcol, + packsize); + io::Item item; + item.name = pName; + item.shape = val->shape(); + item.type = Type::packed16; + + // Use the actual memory as this will be aligned and padded. + // When memory mapping this is required. Shape keeps track of + // tensor size. Saving to *.npz will cut to size. 
+ auto mem = packedTensor->memory(); + item.bytes.resize(mem->size()); + copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); + + ioItems.emplace_back(std::move(item)); +#else + ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); +#endif + } else if (isIntgemm(gemmElementType) && + (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) { +#if COMPILE_CPU + using cpu::integer::cols; + using cpu::integer::rows; + auto allocator = New(getBackend()); + + Tensor paramMat; //This allocates extra 4 bytes at the end because of gemmElementType + allocator->allocate(paramMat, val->shape(), gemmElementType); + + // Compute QuantMultiplier, compress matrix and store quantMult at the end. + // We need to tranpose first, because of our architecture independet format requiring a transposed matrix + Tensor tmp; + allocator->allocate(tmp, val->shape(), val->type()); + cpu::Transpose10(tmp, val); + + if(sizeOf(gemmElementType) == 1) { // is 8-bit Intgemm type + float quantMult = cpu::integer::computeQuantMult(val); + + // Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime + cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type + if(isSsse3(gemmElementType)) { + intgemm::ssse3::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx2(gemmElementType)) { + intgemm::avx2::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx512(gemmElementType)) { + intgemm::avx512bw::Kernels8::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else { + ABORT_IF(gemmElementType != Type::intgemm8, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure + intgemm::Int8::PrepareA(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } + //Put the quantMult at the back of the tensor + cpu::integer::getQuantMult(paramMat) = quantMult; + + } else if(sizeOf(gemmElementType) == 2) { // is 16-bit Intgemm type + float quantMult = cpu::integer::computeQuantMult(val); + + // Hardware-specific conversions which allow to implement memory-mapping and avoid conversion at runtime + cpu::integer::passOrAbort(gemmElementType); // Check if the hardware supports the GEMM type + if(isSse2(gemmElementType)) { + intgemm::sse2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx2(gemmElementType)) { + intgemm::avx2::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else if(isAvx512(gemmElementType)) { + intgemm::avx512bw::Kernels16::PrepareBTransposed(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } else { + ABORT_IF(gemmElementType != Type::intgemm16, "Type {} is not supported", gemmElementType); // shouldn't really happen, but let's make sure + intgemm::Int16::PrepareA(tmp->data(), /*input*/ + paramMat->data(), /*output*/ + quantMult, /*Quant Mult*/ + rows(val), + cols(val)); + } + //Put the 
quantMult at the back of the tensor + cpu::integer::getQuantMult(paramMat) = quantMult; + + } else { + ABORT("Incorrect Intgemm type size: {}", sizeOf(gemmElementType)); + } + + //Save... Same as the fbgemm case + io::Item item; + item.name = pName; + item.shape = val->shape(); + item.type = gemmElementType; + + auto mem = paramMat->memory(); + item.bytes.resize(mem->size()); + copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); + ioItems.emplace_back(std::move(item)); +#else + ABORT("Packed type {} only supported when compiled with -DCOMPILE_CPU=on", gemmElementType); +#endif + } else { + ABORT_IF(saveElementType != Type::float32, "We currently do not know how to save matrices as {}", saveElementType); + io::Item item; + val->get(item, pName); + item.convert(saveElementType); + ioItems.emplace_back(std::move(item)); + } + } + + if (!meta.empty()) + io::addMetaToItems(meta, "special:model.yml", ioItems); + io::saveItems(name, ioItems); + } +}; + +} // namespace marian diff --git a/src/tensors/cpu/fbgemm/expanded_gemm.h b/src/tensors/cpu/fbgemm/expanded_gemm.h index a5c93f6bc..fb07bbad5 100644 --- a/src/tensors/cpu/fbgemm/expanded_gemm.h +++ b/src/tensors/cpu/fbgemm/expanded_gemm.h @@ -2,7 +2,7 @@ #include "graph/node.h" #include "packed_gemm.h" -#include "tensors/cpu/sharp/int_gemm.h" +#include "tensors/cpu/integer_common.h" #if USE_FBGEMM #ifdef __GNUC__ @@ -57,14 +57,12 @@ struct FbgemmPacked16PackNodeOp : public UnaryNodeOp { int nbcol_; uint64_t packsize_; - FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose, float clipValue) + FbgemmPacked16PackNodeOp(Expr a, PackMatrix packMat, bool transpose) : UnaryNodeOp(a, newShape(a, transpose), Type::uint8), packMat_(packMat), transpose_(transpose) { if(packMat != PackMatrix::B) ABORT("Only prepacking of B (weight matrix) is supported"); - if(clipValue != 0) - ABORT("Clipping is not supported"); if(!memoize_) ABORT("Only constant weight node can be packed"); } @@ -144,16 +142,13 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp { FbgemmPacked8PackNodeOp(Expr a, PackMatrix packMat, marian::Type packType, - bool transpose, - float clipValue) + bool transpose) : UnaryNodeOp(a, newShape(a, transpose), Type::uint8), packMat_(packMat), packType_(packType), transpose_(transpose) { if(packMat != PackMatrix::B) ABORT("Only prepacking of B (weight matrix) is supported"); - if(clipValue != 0) - ABORT("Clipping is not supported"); if(!memoize_) ABORT("Only constant weight node can be packed"); } @@ -337,7 +332,7 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp { k_, transA_, transB_); - marian::cpu::int16::AddBias(val_, child(2)->val())) }; + marian::cpu::integer::AddBias(val_, child(2)->val())) }; } else { nodeOps = { NodeOp(fbgemmPacked8Gemm(val_, child(0)->val(), @@ -377,11 +372,11 @@ static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, boo } } -static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float clipValue) { +static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) { if (elementType == Type::packed16) - return Expression(a, packMat, transpose, clipValue); + return Expression(a, packMat, transpose); else if (isPacked(elementType) && sizeOf(elementType) == 1) - return Expression(a, packMat, elementType, transpose, clipValue); + return Expression(a, packMat, elementType, transpose); else { ABORT("Only int8 and fp16 are available. 
{}", elementType); return nullptr; diff --git a/src/tensors/cpu/fbgemm/expression_graph_packable.h b/src/tensors/cpu/fbgemm/expression_graph_packable.h deleted file mode 100644 index f5b05c302..000000000 --- a/src/tensors/cpu/fbgemm/expression_graph_packable.h +++ /dev/null @@ -1,156 +0,0 @@ -#pragma once - -#include "graph/expression_graph.h" -#include "packed_gemm.h" - -namespace marian { - -// When FBGEMM based packed GEMM is used, some weight matrices need to be packed offline. -// The decision which weights can be packed or not should be done walking through the graph. -// This requires some more changes, but we temporarily do this just by name ("_W") of the weights. -// And, this introduces a low level packed_gemm.h apis interact with high level graph class. -// So, we make a subclass of ExpressionGraph and put those immature codes in this class. -// We will improve this in the near future. -class ExpressionGraphPackable : public ExpressionGraph { -public: - ExpressionGraphPackable() - : ExpressionGraph( /* inference = */ true) {} // Packable expression graph only supports inference - - virtual ~ExpressionGraphPackable() {} - - // Convert model weights into packed format and save to IO items. - // @TODO: review this - void packAndSave(const std::string& name, const std::string& meta, Type gemmElementType = Type::float32, Type saveElementType = Type::float32) { - std::vector ioItems; - - // sorted by name in std::map - for (auto p : params()->getMap()) { - std::string pName = p.first; - - if (!namespace_.empty()) { - if (pName.substr(0, namespace_.size() + 2) == namespace_ + "::") - pName = pName.substr(namespace_.size() + 2); - } - - Tensor val = p.second->val(); - - // save as packed format - // @TODO Hardcoded to find packable weights - // int8 - all the weights used for affine op and dot op - // fp16 - all the weights used for affine op - if ((gemmElementType == Type::packed8avx2 || gemmElementType == Type::packed8avx512) - && (pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2)) { -#if USE_FBGEMM - using namespace marian::cpu::variant; - // packing information - size - int nrow; - int ncol; - uint64_t packsize; - - fbgemmPacked8PackInfo(val->shape(), - gemmElementType, - pName.find("Wemb") != std::string::npos, - nrow, - ncol, - packsize); - - auto allocator = New(getBackend()); - - // buffer tensor to save packed matrix - Tensor packedTensor; - allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); - - //Pack B matrix into int8 - fbgemmPacked8Pack(packedTensor, - val->data(), - gemmElementType, - pName.find("Wemb") != std::string::npos, - nrow, - ncol, - packsize); - io::Item item; - item.name = pName; - item.shape = val->shape(); - item.type = gemmElementType; - - // Use the actual memory as this will be aligned and padded. - // When memory mapping this is required. Shape keeps track of - // tensor size. Saving to *.npz will cut to size. 
- auto mem = packedTensor->memory(); - item.bytes.resize(mem->size()); - copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); - - ioItems.emplace_back(std::move(item)); -#else - ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); -#endif - // fp16 quantization option - } else if (gemmElementType == Type::packed16 && pName.find("_W") == pName.length() - 3) { -#if USE_FBGEMM - using namespace marian::cpu::variant; - - // packing information - int nrow, ncol, kernel_ncol_blocks, brow, bcol, last_brow, nbrow, nbcol; - uint64_t packsize; - - fbgemmPacked16PackInfo(val->shape(), - false, - nrow, - ncol, - kernel_ncol_blocks, - brow, - bcol, - last_brow, - nbrow, - nbcol, - packsize); - - auto allocator = New(getBackend()); - - Tensor packedTensor; - allocator->allocate(packedTensor, { 1, (int32_t)packsize }, Type::uint8); - - // fbgemmPacked16Pack - fbgemmPacked16Pack(packedTensor, - val->data(), - false, - nrow, - ncol, - kernel_ncol_blocks, - brow, - bcol, - last_brow, - nbrow, - nbcol, - packsize); - io::Item item; - item.name = pName; - item.shape = val->shape(); - item.type = Type::packed16; - - // Use the actual memory as this will be aligned and padded. - // When memory mapping this is required. Shape keeps track of - // tensor size. Saving to *.npz will cut to size. - auto mem = packedTensor->memory(); - item.bytes.resize(mem->size()); - copy(backend_, mem->data(), mem->data() + mem->size(), item.bytes.data()); - - ioItems.emplace_back(std::move(item)); -#else - ABORT("Packed type {} only supported when compiled with -DUSE_FBGEMM=on", gemmElementType); -#endif - } else { - io::Item item; - val->get(item, pName); - item.convert(saveElementType); - ioItems.emplace_back(std::move(item)); - } - } - - if (!meta.empty()) - io::addMetaToItems(meta, "special:model.yml", ioItems); - io::saveItems(name, ioItems); - } -}; - -} // namespace marian \ No newline at end of file diff --git a/src/tensors/cpu/int16.h b/src/tensors/cpu/int16.h deleted file mode 100644 index f2bdd0a91..000000000 --- a/src/tensors/cpu/int16.h +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once - -#include "graph/node.h" -#include "tensors/cpu/sharp/int_gemm.h" - -namespace marian { -namespace cpu { -namespace int16 { - -struct QuantizeNodeOp : public UnaryNodeOp { - float clipValue_; - - QuantizeNodeOp(Expr a, float clipValue) - : UnaryNodeOp(a, Type::int16), clipValue_{clipValue} {} - - NodeOps forwardOps() override { - return {NodeOp(Quantize16(val_, child(0)->val(), clipValue_))}; - } - - NodeOps backwardOps() override { - ABORT("Only used for inference"); - } - - const std::string type() override { return "quantizeInt16"; } -}; - -class DotNodeOp : public NaryNodeOp { -private: - float scalar_; - -public: - DotNodeOp(Expr a, Expr b, float scalar) - : NaryNodeOp({a, b}, newShape(a, b), Type::float32), scalar_(scalar) {} - - Shape newShape(Expr a, Expr b) { - auto shapeA = a->shape(); - auto shapeB = b->shape(); - - // Computing A * B^T - shapeB.set(-2, b->shape()[-1]); - shapeB.set(-1, b->shape()[-2]); - - Shape outShape = shapeA; - outShape.set(-1, shapeB[-1]); - ABORT_IF(shapeA[-1] != shapeB[-2], - "matrix product requires dimensions to match"); - return outShape; - } - - NodeOps forwardOps() override { - return {NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_))}; - } - - NodeOps backwardOps() override { - ABORT("Only used for inference"); - } - - const std::string type() override { return "dotInt16"; } -}; - -class AffineNodeOp : public NaryNodeOp 
{ -private: - float scalar_; - -public: - AffineNodeOp(const std::vector& nodes, float scalar) - : NaryNodeOp(nodes, newShape(nodes[0], nodes[1]), Type::float32), scalar_(scalar) {} - - Shape newShape(Expr a, Expr b) { - auto shapeA = a->shape(); - auto shapeB = b->shape(); - - // Computing A * B^T - shapeB.set(-2, b->shape()[-1]); - shapeB.set(-1, b->shape()[-2]); - - Shape outShape = shapeA; - outShape.set(-1, shapeB[-1]); - ABORT_IF(shapeA[-1] != shapeB[-2], - "matrix product requires dimensions to match"); - return outShape; - } - - NodeOps forwardOps() override { - return { - NodeOp(ProdInt16(val_, child(0)->val(), child(1)->val(), scalar_); - AddBias(val_, child(2)->val())) - }; - } - - NodeOps backwardOps() override { - ABORT("Only used for inference"); - } - - const std::string type() override { return "affineInt16"; } -}; - -static inline Expr dot(Expr a, Expr b, float scalar) { - return Expression(a, b, scalar); -} - -static inline Expr affine(Expr a, Expr b, Expr c, float scalar) { - std::vector nodes = {a, b, c}; - return Expression(nodes, scalar); -} - -static inline Expr quantize(Expr a, float clipValue) { - return Expression(a, clipValue); -} - -} // namespace int16 -} // namespace cpu -} // namespace marian diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp new file mode 100644 index 000000000..6864a86d6 --- /dev/null +++ b/src/tensors/cpu/integer_common.cpp @@ -0,0 +1,45 @@ +#include "integer_common.h" + +namespace marian { +namespace cpu { +namespace integer { +// This operates on floats after processing so doesn't care about int8_t vs int16_t. +void AddBias(marian::Tensor C, const marian::Tensor Bias) { + float* y = C->data(); + const float* x = C->data(); + const float* bias = Bias->data(); + + const int m = C->shape().elements() / C->shape()[-1]; + const int n = C->shape()[-1]; + + for(int j = 0; j < m; ++j) { + int i = 0; +#ifdef __AVX512F__ + int n16 = n & ~15; + for(; i < n16; i += 16) { + __m512 ai = _mm512_loadu_ps(x + j * n + i); + __m512 bi = _mm512_loadu_ps(bias + i); + __m512 yi = _mm512_add_ps(ai, bi); + _mm512_storeu_ps(y + j * n + i, yi); + } +#else + int n4 = (n / 4) * 4; + for(; i < n4; i += 4) { + __m128 ai = _mm_loadu_ps(x + j * n + i); + __m128 bi = _mm_loadu_ps(bias + i); + __m128 yi = _mm_add_ps(ai, bi); + _mm_storeu_ps(y + j * n + i, yi); + } +#endif + for(; i < n; i++) { + y[j * n + i] = x[j * n + i] + bias[i]; + } + } +} + +//template void prepareAndTranspose;//(io::Item& item, const char * input); +//template void prepareAndTranspose(io::Item&, const char *); + +} //integer +} //cpu +} //marian \ No newline at end of file diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h new file mode 100644 index 000000000..6e96f8cdc --- /dev/null +++ b/src/tensors/cpu/integer_common.h @@ -0,0 +1,223 @@ +#pragma once + +#include "tensors/tensor_allocator.h" +#include "tensors/tensor_operators.h" +#include "tensors/cpu/aligned.h" +#include "common/io_item.h" + +#if COMPILE_CPU +#include "3rd_party/intgemm/intgemm/intgemm.h" +#else +namespace intgemm { + struct Int8; + struct Int16; + namespace ssse3 { + struct Kernels8; + } + namespace sse2 { + struct Kernels16; + } + namespace avx2 { + struct Kernels8; + struct Kernels16; + } + namespace avx512bw { + struct Kernels8; + struct Kernels16; + } + namespace avx512vnni { + struct Kernels8; + } +} +#endif + +#include +#include +#include +#include +#include +#include + +namespace marian { +namespace cpu { +namespace integer { + +//Convenient function to 
get rows and columns of a tensor, shadowed by namespace.
+inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
+inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
+
+inline int cols(Shape& shape) { return shape[-1]; }
+inline int rows(Shape& shape) { return shape.elements() / cols(shape); }
+
+template <Type vtype> struct intgemm_;
+
+template <> struct intgemm_<Type::intgemm8> {
+  using width = intgemm::Int8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8ssse3> {
+  using width = intgemm::ssse3::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8avx2> {
+  using width = intgemm::avx2::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8avx512> {
+  using width = intgemm::avx512bw::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm8avx512vnni> {
+  using width = intgemm::avx512vnni::Kernels8;
+  using type = int8_t;
+};
+
+template <> struct intgemm_<Type::intgemm16> {
+  using width = intgemm::Int16;
+  using type = int16_t;
+};
+
+template <> struct intgemm_<Type::intgemm16sse2> {
+  using width = intgemm::sse2::Kernels16;
+  using type = int16_t;
+};
+
+template <> struct intgemm_<Type::intgemm16avx2> {
+  using width = intgemm::avx2::Kernels16;
+  using type = int16_t;
+};
+
+template <> struct intgemm_<Type::intgemm16avx512> {
+  using width = intgemm::avx512bw::Kernels16;
+  using type = int16_t;
+};
+
+template <Type vtype>
+static inline float& getQuantMult(marian::Tensor val) {
+#if COMPILE_CPU
+  ABORT_IF(!isIntgemm(val->type()), "getQuantMult does not work for type {}", val->type());
+  typedef typename intgemm_<vtype>::type Integer;
+  return *(reinterpret_cast<float*>(val->data<Integer>() + val->shape().elements()));
+#else
+  val;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+#endif
+}
+
+static inline Type getIntgemmType(Type vtype) {
+#if COMPILE_CPU
+  if (vtype == Type::intgemm8) {
+    if (intgemm::kCPU == intgemm::CPUType::AVX512VNNI) {
+      return Type::intgemm8avx512vnni;
+    } else if (intgemm::kCPU == intgemm::CPUType::AVX512BW) {
+      return Type::intgemm8avx512;
+    } else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
+      return Type::intgemm8avx2;
+    } else if (intgemm::kCPU == intgemm::CPUType::SSSE3) {
+      return Type::intgemm8ssse3;
+    } else {
+      ABORT("Your CPU doesn't support SSSE3, necessary for 8bit intgemm to work.");
+    }
+  } else if (vtype == Type::intgemm16) {
+    if (intgemm::kCPU > intgemm::CPUType::AVX2) {
+      return Type::intgemm16avx512;
+    } else if (intgemm::kCPU == intgemm::CPUType::AVX2) {
+      return Type::intgemm16avx2;
+    } else if (intgemm::kCPU >= intgemm::CPUType::SSE2) {
+      return Type::intgemm16sse2;
+    } else {
+      ABORT("Your CPU doesn't support SSE2, necessary for 16bit intgemm to work.");
+    }
+  } else {
+    ABORT("Unrecognised type {}.", vtype);
+  }
+#else
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+  return vtype;
+#endif
+}
+
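To make the role of getIntgemmType() above concrete, here is a minimal illustrative sketch. It is not part of this patch, and the helper name resolveGemmType is invented for the example; it only shows how a loader could narrow the hardware-agnostic intgemm8/intgemm16 tags to the CPU-specific variants the rest of this header dispatches on.

#include "tensors/cpu/integer_common.h"

// Hypothetical helper: map a type tag read from a binary model to the type that
// will actually be executed on this machine. Hardware-agnostic tags are narrowed
// with getIntgemmType(); hardware-specific tags are kept as they are.
static marian::Type resolveGemmType(marian::Type storedType) {
  using marian::Type;
  if(storedType == Type::intgemm8 || storedType == Type::intgemm16)
    return marian::cpu::integer::getIntgemmType(storedType); // e.g. intgemm8 -> intgemm8avx2 on an AVX2-only CPU
  return storedType; // already hardware-specific
}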
+static inline bool passOrAbort(Type vtype) {
+#if COMPILE_CPU
+  if (vtype == Type::intgemm8 || vtype == Type::intgemm16) {
+    return true;
+  } else if (vtype == Type::intgemm16sse2) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSE2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8ssse3) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::SSSE3, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8avx2 || vtype == Type::intgemm16avx2) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX2, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8avx512 || vtype == Type::intgemm16avx512) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512BW, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  } else if (vtype == Type::intgemm8avx512vnni) {
+    ABORT_IF(intgemm::kCPU < intgemm::CPUType::AVX512VNNI, "Your CPU doesn't support the architecture necessary to decode model of type {}. Try an older architecture instead.", vtype);
+  }
+  return true;
+#else
+  vtype;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+  return false;
+#endif
+}
+
+template <Type vtype>
+static inline float computeQuantMult(marian::Tensor val) {
+#if COMPILE_CPU
+  if(sizeOf(vtype) == 1)
+    return 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements());
+  else if(sizeOf(vtype) == 2)
+    return 1024.0f;
+  else
+    ABORT("Unhandled type size {}", sizeOf(vtype));
+#else
+  val;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+#endif
+}
+
+// This operates on floats after processing so doesn't care about int8_t vs int16_t.
+void AddBias(marian::Tensor C, const marian::Tensor Bias);
+
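As an aside (not part of this patch), the helpers above and prepareAndTransposeB() below assume a simple memory layout for an intgemm parameter: the quantized, already-transposed values are followed by one float holding the quantization multiplier. A small sketch of that size arithmetic, with the helper name intgemmParamBytes invented for the example:

#include <cstddef>
#include <cstdint>

// Bytes occupied by an intgemm parameter of rows x cols elements:
// the quantized values themselves plus one trailing float for the quantization
// multiplier, which getQuantMult() reads back at data() + shape().elements().
template <class Integer>
constexpr std::size_t intgemmParamBytes(std::size_t rows, std::size_t cols) {
  return rows * cols * sizeof(Integer) + sizeof(float);
}

static_assert(intgemmParamBytes<int8_t>(512, 512)  == 512 * 512 + 4,     "8-bit layout");
static_assert(intgemmParamBytes<int16_t>(512, 512) == 512 * 512 * 2 + 4, "16-bit layout");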
+// For loading architecture-agnostic models. We only need PrepareBQuantizedTransposed here, because B was
+// already transposed when the binary was constructed. Afterwards we copy over the quantization multiplier
+// that is stored at the end of the tensor.
+template <Type vtype>
+void prepareAndTransposeB(io::Item& item, const char * input) {
+#if COMPILE_CPU
+  typedef typename intgemm_<vtype>::type Integer;
+  Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
+  // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
+  // If this is the case, we need to temporarily allocate aligned memory, copy the results, and then free it.
+  if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
+                                   output_tensor,
+                                   rows(item.shape),  // Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); // rows() here returns the columns of the transposed input matrix, and cols() the rows.
+  } else {
+    Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+    std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
+    Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
+                                   reinterpret_cast<Integer *>(aligned_output),
+                                   rows(item.shape),  // Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); // rows() here returns the columns of the transposed input matrix, and cols() the rows.
+    // Copy to output tensor
+    std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
+    genericFree(aligned_input);
+    genericFree(aligned_output);
+  }
+  // Copy the quantMult
+  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
+  *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
+#else
+  item, input;
+  ABORT("Using intgemm binary models is only supported when compiling marian with -DCOMPILE_CPU=ON.");
+#endif
+}
+
+} //integer
+} //cpu
+} //marian
\ No newline at end of file
diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h
new file mode 100644
index 000000000..88408aa18
--- /dev/null
+++ b/src/tensors/cpu/intgemm_interface.h
@@ -0,0 +1,132 @@
+#pragma once
+
+#include "graph/node.h"
+#include "graph/node_operators_unary.h"
+#include "integer_common.h"
+
+namespace marian {
+
+namespace cpu {
+namespace integer {
+
+#if COMPILE_CPU
+/*
+ * Prepare an activation matrix into intgemm8/16 format. For now the activation matrix is just quantized.
+ * Expr a: The input tensor
+ */
+template <Type vtype>
+static inline Expr prepareA(Expr a) {
+  auto nodeOp = [](Expr out, const std::vector<Expr>& children) {
+    Expr in = children[0];
+    auto quantMult = computeQuantMult<vtype>(in->val());
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::PrepareA(in->val()->data(), /*input*/
+                                     out->val()->data<Integer>(), /*output*/
+                                     quantMult, /*Quant Mult*/
+                                     rows(in->val()),
+                                     cols(in->val()));
+    getQuantMult<vtype>(out->val()) = quantMult;
+  };
+
+  return lambda({a}, a->shape(), vtype, nodeOp);
+}
+#endif
+
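Before the actual multiply wrapper below, a self-contained sketch (not part of this patch) of the arithmetic it relies on: values are scaled into integers with a quantization multiplier (127/max|x| for 8-bit, as in computeQuantMult() above), products are accumulated as integers, and the result is scaled back with 1/(quantMultA*quantMultB), which is the unquant_mult passed to the intgemm callbacks below. The maxima used here are made up for the example.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float a = 0.75f, b = -0.5f;          // pretend these are single matrix entries
  const float quantMultA = 127.0f / 1.0f;    // assuming max|A| == 1.0
  const float quantMultB = 127.0f / 2.0f;    // assuming max|B| == 2.0

  const int8_t qa = static_cast<int8_t>(std::lround(a * quantMultA)); // quantize A entry
  const int8_t qb = static_cast<int8_t>(std::lround(b * quantMultB)); // quantize B entry

  const int32_t accum = static_cast<int32_t>(qa) * static_cast<int32_t>(qb); // integer product
  const float unquantMult = 1.0f / (quantMultA * quantMultB);                // undo both scalings
  std::printf("float product %f, reconstructed %f\n", a * b, accum * unquantMult);
  return 0;
}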
+/*
+ * This computes A*B (+ bias if available) in intgemm.
+ * Expr a: The activation matrix in intgemm format
+ * Expr b: The parameter matrix in intgemm format
+ * Expr bias: The bias
+ * bool transA - transpose input A if true
+ * bool transB - unused here (@TODO remove?)
+ * float scale - scale the output by `scale`
+ * The template argument controls whether we're doing 16bit integers or 8bit integers.
+ * It can be Type::intgemm8 or Type::intgemm16 and all their hardware-specific variants.
+ */
+template <Type vtype>
+static inline Expr affineOrDotTyped(Expr a, Expr bQuant, Expr bias, bool transA, bool /*transB*/, float scale) {
+#if COMPILE_CPU
+  ABORT_IF(!isFloat(a->value_type()), "Intgemm expects type of A to be float32 not {}", a->value_type());
+  ABORT_IF(!isIntgemm(bQuant->value_type()), "Intgemm expects type of B to be a variant of intgemm not {}", bQuant->value_type());
+
+  auto aQuant = prepareA<vtype>(transA ? transpose(a) : a); // A should not be quantized yet as seen above, hence quantize here
+
+  // determine the output shape m x n for A: m x k and B: k x n
+  // since we transpose A beforehand we don't need to take care of transposed shapes here
+  Shape outShape = aQuant->shape();
+  outShape.set(-1, bQuant->shape()[-1]);
+
+  // wrap the multiply functions to be executed in the forward step of a Lambda node
+  auto dotOrAffineNodeOp = [=](Expr out, const std::vector<Expr>& children) {
+    Expr aQuant = children[0];
+    Expr bQuant = children[1];
+    Expr bias   = children.size() > 2 ? children[2] : nullptr;
+
+    // when we arrive here, A and B are already quantized, so just get the multipliers
+    float aQuantMult = getQuantMult<vtype>(aQuant->val());
+    float bQuantMult = getQuantMult<vtype>(bQuant->val());
+
+    float unquant_mult = 1.0f / (aQuantMult * bQuantMult);
+    unquant_mult = unquant_mult * scale;
+
+    typedef typename intgemm_<vtype>::type Integer;
+    if(bias) { // dispatch a multiply with integrated bias addition i.e. affine(...)
+      intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
+                                       /*B=*/bQuant->val()->data<Integer>(),
+                                       rows(aQuant->val()),
+                                       cols(aQuant->val()),
+                                       cols(bQuant->val()),
+                                       intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, /*bias=*/bias->val()->data(), /*output=*/out->val()->data()));
+    } else { // dispatch a multiply without bias addition i.e. dot(...)
+      intgemm_<vtype>::width::Multiply(/*A=*/aQuant->val()->data<Integer>(),
+                                       /*B=*/bQuant->val()->data<Integer>(),
+                                       rows(aQuant->val()),
+                                       cols(aQuant->val()),
+                                       cols(bQuant->val()),
+                                       intgemm::callbacks::UnquantizeAndWrite(unquant_mult, /*output=*/out->val()->data()));
+    }
+  };
+
+  std::vector<Expr> children = {aQuant, bQuant};
+  if(bias)
+    children.push_back(bias);
+
+  return lambda(children, outShape, Type::float32, dotOrAffineNodeOp); // inference-only Lambda node
+#else
+  a, bQuant, bias, transA, scale;
+  ABORT("You need to enable CPU compilation to use this feature. Use cmake .. -DCOMPILE_CPU=ON");
+#endif
+}
+
+// Dispatch the correct hardware-agnostic or hardware-specific matrix multiply
+static inline Expr affineOrDot(Expr a, Expr bQuant, Expr bias, bool transA, bool transB, float scale) {
+  Type bQuantElementType = bQuant->value_type();
+  static const bool pass = cpu::integer::passOrAbort(bQuantElementType);
+  pass; // We declare this variable as static so that passOrAbort is only ever run once during initialization.
+  switch(bQuantElementType) {
+    //case Type::intgemm8 :  // The generic case selects CPU automatically, but we set all the types manually anyways.
+    //  return cpu::integer::affineOrDotTyped<Type::intgemm8>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8ssse3 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8ssse3>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8avx2 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8avx2>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8avx512 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8avx512>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm8avx512vnni :
+      return cpu::integer::affineOrDotTyped<Type::intgemm8avx512vnni>(a, bQuant, bias, transA, transB, scale);
+    //case Type::intgemm16 :  // The generic case selects CPU automatically, but we set all the types manually anyways.
+    //  return cpu::integer::affineOrDotTyped<Type::intgemm16>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm16sse2 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm16sse2>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm16avx2 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm16avx2>(a, bQuant, bias, transA, transB, scale);
+    case Type::intgemm16avx512 :
+      return cpu::integer::affineOrDotTyped<Type::intgemm16avx512>(a, bQuant, bias, transA, transB, scale);
+    default:
+      ABORT("Unsupported type {} for intgemm.", bQuantElementType);
+  }
+}
+
+} // namespace integer
+} // namespace cpu
+} // namespace marian
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
index 8529db8b5..86b87b033 100755
--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@@ -7,8 +7,17 @@
 #include "tensors/tensor.h"
 #include "tensors/tensor_allocator.h"
 
+#if MKL_FOUND
+#include <mkl.h>
+#else
+#if BLAS_FOUND
+#include <cblas.h>
+#endif
+#endif
+
+#include "integer_common.h"
 #include "prod_blas.h"
-#include "sharp/int_gemm.h"
+
 
 namespace marian {
 
@@ -187,7 +196,7 @@ void ProdWithBias(marian::Tensor C,
                   float beta,
                   float scalar) {
   cpu::Prod(C, A, B, transA, transB, beta, scalar);
-  cpu::int16::AddBias(C, bias);
+  cpu::integer::AddBias(C, bias);
 }
 
 void CSRProd(marian::Tensor C,
diff --git a/src/tensors/cpu/sharp/avx_gemm.cpp b/src/tensors/cpu/sharp/avx_gemm.cpp
deleted file mode 100644
index 61f75fea6..000000000
--- a/src/tensors/cpu/sharp/avx_gemm.cpp
+++ /dev/null
@@ -1,615 +0,0 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#ifdef __AVX512F__
-
-namespace marian {
-namespace cpu {
-namespace int16 {
-
-namespace {
-// Load from memory, multiply, and convert to int32_t.
-inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
-  // Load 16 floats
-  __m512 val = _mm512_load_ps(input);
-  // Multiply each by the quantization factor.
-  val = _mm512_mul_ps(val, quant_mult_reg);
-  // Cast to 32-bit int
-  return _mm512_cvtps_epi32(val);
-}
-}  // namespace
-
-// Convert
-void AVX_Quantize16(const float *input,
-                    int16_t *output,
-                    float quant_mult,
-                    std::size_t size) {
-  assert(size % 16 == 0);
-  assert(reinterpret_cast(input) % 64 == 0);
-  // Fill with the quantization multiplier.
-  const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
-  const float *end = input + size;
-  for(; input != end; input += 16, output += 16) {
-    // There doesn't seem to be an unmasked version.
- _mm512_mask_cvtsepi32_storeu_epi16( - output, 0xffff, QuantizerGrab(input, quant_mult_reg)); - } -} - -void AVX_Quantize8(const float *input, - int8_t *output, - float quant_mult, - std::size_t size) { - assert(size % 16 == 0); - assert(reinterpret_cast(input) % 64 == 0); - const __m512i neg127 = _mm512_set1_epi32(-127); - const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult); - const float *end = input + size; - for(; input < end; input += 16, output += 16) { - __m512i asint = QuantizerGrab(input, quant_mult_reg); - /* Ban -128. We can't negate it. - * The largest possbile product is -128 * -128 = 2^14. If two of those are - * summed that's 2^15 which is too large for int16_t. By banning -128 we - * can accumulate two in int16_t w/o saturation before going to int32_t. - * But this is ok because apparently the instruction will saturate. - */ - asint = _mm512_max_epi32(asint, neg127); - // There doesn't seem to be an unmasked version. - _mm512_mask_cvtsepi32_storeu_epi8(output, 0xffff, asint); - } -} - -namespace { - -union FloatAccess { - float as_f[4]; - __m128 as_n; -}; -union IntAccess { - int32_t as_i[4]; - __m128i as_n; -}; - -/* Convert 16-bit to 32-bit and add, not caring what parts are added. - * Implementations: - * 1. - * https://github.com/tesseract-ocr/tesseract/blob/master/src/arch/intsimdmatrixavx2.cpp#L67 - * under Apache license: This does a multiply by 1 and horizontal add: - * _mm512_madd_epi16(sum, _mm512_set1_epi16(1)) - * Current fastest. - * - * 2. Signed extension and fold halves: - * sum = _mm512_add_epi32( - * _mm512_cvtepi16_epi32(_mm512_castsi512_si256(sum)), - * _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(sum, 1))); - * - * 3. Sign extend by abuse of bitshift, then add. - * __m128i shift16 = _mm_set_epi32(0,0,0,16); - * sum = _mm512_add_epi32( - * _mm512_sra_epi32(_mm512_sll_epi32(sum, shift16), shift16), - * _mm512_sra_epi32(sum, shift16)); - */ -inline void Convert32Sum(__m512i &sum) { - short one = 1; - sum = _mm512_madd_epi16(sum, _mm512_set1_epi16(one)); -} - -// Two sum version. -struct ReducedPair { - int32_t result[2]; -}; -inline ReducedPair Reduce16to32(__m512i sum1, __m512i sum2) { - Convert32Sum(sum1); - Convert32Sum(sum2); - // 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 - __m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2), - _mm512_unpacklo_epi32(sum1, sum2)); - // 1 2 1 2 1 2 1 2 - __m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack12), - _mm512_extracti64x4_epi64(pack12, (short)1)); - // 1 2 1 2 - IntAccess a; - a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves), - _mm256_extracti128_si256(halves, 1)); - ReducedPair ret; - ret.result[0] = a.as_i[0] + a.as_i[2]; - ret.result[1] = a.as_i[1] + a.as_i[3]; - return ret; -} - -// Assuming sum1, sum2, sum3, and sum4 are arrays 32-bit signed integers, -// reduce within each. -// Returns [sum(sum1), sum(sum2), sum(sum3), sum(sum4)] -// TODO: consider doing in 64-bit, allowing 4 more bits of quantization? -inline __m128i Reduce32(__m512i sum1, - __m512i sum2, - __m512i sum3, - __m512i sum4) { - // 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 - __m512i pack12 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum1, sum2), - _mm512_unpacklo_epi32(sum1, sum2)); - // 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 - __m512i pack34 = _mm512_add_epi32(_mm512_unpackhi_epi32(sum3, sum4), - _mm512_unpacklo_epi32(sum3, sum4)); - // 1 2 3 4 1 2 3 4 1 2 3 4 1 2 3 4 - __m512i pack1234 = _mm512_add_epi32(_mm512_unpackhi_epi64(pack12, pack34), - _mm512_unpacklo_epi64(pack12, pack34)); - // Cut the register into halves and sum those. 
1 2 3 4 1 2 3 4 - __m256i halves = _mm256_add_epi32(_mm512_castsi512_si256(pack1234), - _mm512_extracti64x4_epi64(pack1234, (short)1)); - // Again: cut the register into halves and sum those. 1 2 3 4 - return _mm_add_epi32(_mm256_castsi256_si128(halves), - _mm256_extracti128_si256(halves, 1)); -} - -// Four sum version -inline __m128i Reduce16to32(__m512i sum1, - __m512i sum2, - __m512i sum3, - __m512i sum4) { - Convert32Sum(sum1); - Convert32Sum(sum2); - Convert32Sum(sum3); - Convert32Sum(sum4); - return Reduce32(sum1, sum2, sum3, sum4); -} - -// Somewhat inefficient reduce for single __m256i containing int32_t -inline int32_t Reduce32(__m256i halves) { - IntAccess a; - a.as_n = _mm_add_epi32(_mm256_castsi256_si128(halves), - _mm256_extracti128_si256(halves, 1)); - // TODO is there a more efficient way? - return a.as_i[0] + a.as_i[1] + a.as_i[2] + a.as_i[3]; -} - -// Somewhat inefficient reduce for single __m512i containing int32_t -inline int32_t Reduce32(__m512i sum1) { - // Fold register over itself. - return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1), - _mm512_extracti64x4_epi64(sum1, (short)1))); -} - -inline int32_t Reduce16to32(__m512i sum1) { - Convert32Sum(sum1); - // Fold register over itself. - return Reduce32(_mm256_add_epi32(_mm512_castsi512_si256(sum1), - _mm512_extracti64x4_epi64(sum1, (short)1))); -} - -class ScatterPut { -public: - explicit ScatterPut(float unquant_mult, int num_B_rows) - : unquant_mult_(unquant_mult), - unquant_mult_sse_(_mm_set1_ps(unquant_mult)), -#ifdef __AVX512VL__ - num_b_rows_scatter_(_mm_set_epi32(num_B_rows * 3 * sizeof(float), - num_B_rows * 2 * sizeof(float), - num_B_rows * 1 * sizeof(float), - num_B_rows * 0 * sizeof(float))), -#endif - num_B_rows_(num_B_rows) { - } - - inline void Write(float *base, __m128i reduced) { - __m128 float_sums = _mm_cvtepi32_ps(reduced); - float_sums = _mm_mul_ps(float_sums, unquant_mult_sse_); -#ifdef __AVX512VL__ - // The scatter instruction requires avx512vl - _mm_i32scatter_ps(base, num_b_rows_scatter_, float_sums, (short)1); -#else - FloatAccess a; - // Get floats for each of the sums to write. - a.as_n = float_sums; - // Also note that the memory acceses on C are not consecutive, but this is a - // tradeoff that we have to make. We can't have consecutive accesses of A, - // B, *and* C. But we access A and B a lot more so it makes sense to do it - // this way. Scatter to outputs: - base[0] = a.as_f[0]; - base[num_B_rows_] = a.as_f[1]; - base[2 * num_B_rows_] = a.as_f[2]; - base[3 * num_B_rows_] = a.as_f[3]; -#endif - } - - inline void Write(float *base, ReducedPair reduced) { - base[0] = unquant_mult_ * static_cast(reduced.result[0]); - base[num_B_rows_] = unquant_mult_ * static_cast(reduced.result[1]); - } - - inline void Write(float *base, int32_t reduced) { - base[0] = unquant_mult_ * static_cast(reduced); - } - -private: - const float unquant_mult_; - const __m128 unquant_mult_sse_; -#ifdef __AVX512VL__ - const __m128i num_b_rows_scatter_; -#endif - const int num_B_rows_; -}; - -} // namespace - -// This is an AVX512F implementation of int16_t multiply based on Jacob -// Devlin's SSE code. 
The original SSE code was: - -// Copyright (c) 2017 Microsoft Corporation - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -// We are multiplying A * B^T, as opposed to A * B. This is important because it -// means we can do consecutive memory access on A * B^T which allows to to take -// the most advantage of L1 cache. -// -// B is typically a weight matrix, so it can be pre-processed offline, and -// therefore this transpose does not cost anything. A is typically an activation -// minibatch matrix. A and B must be 64-byte aligned. C should be the usual -// 4-byte alignment. -void AVX_MatrixMult16(const __m512i *A, - const __m512i *B, - float *C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width) { - assert(width % 32 == 0); - assert(reinterpret_cast(A) % 64 == 0); - assert(reinterpret_cast(B) % 64 == 0); - - ScatterPut put(unquant_mult, num_B_rows); - - const int sse_width = width / 32; - - // We do loop unrolling over A. This is *significantly* faster - // since B can live in the registers. We are assuming that - // A is a multiple of 4, but we can add extra code to handle values of 1, - // 2, 3. - // - // We could also do loop unrolling over B, which adds some additional speedup. - // We don't do that for the sake of clarity. - // - // There are other memory access patterns we could do, e.g., put B on the - // outer loop. The justification is that A is typically small enough that it - // can live in L1 cache. B is usually a larger weight matrix, so it might not - // be able to. However, we are using each element of B four times while it's - // still in a register, so caching is not as important. - - // Round down to a multiple of 4. - int num_unroll_rows = num_A_rows & ~3; - for(int i = 0; i < num_unroll_rows; i += 4) { - const __m512i *A1_row = A + (i + 0) * sse_width; - const __m512i *A2_row = A + (i + 1) * sse_width; - const __m512i *A3_row = A + (i + 2) * sse_width; - const __m512i *A4_row = A + (i + 3) * sse_width; - - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - - // This is just a simple dot product, unrolled four ways. 
- for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - - __m512i a1 = *(A1_row + k); - __m512i a2 = *(A2_row + k); - __m512i a3 = *(A3_row + k); - __m512i a4 = *(A4_row + k); - - // madd_epi16 does multiply add on 8 16-bit integers and accumulates - // into a four 32-bit register. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7, - // h8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints) - // result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8] - // (32-bit ints) Then add_epi32 just effectively does a += on these - // 32-bit integers. - sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1)); - sum2 = _mm512_add_epi32(sum2, _mm512_madd_epi16(b, a2)); - sum3 = _mm512_add_epi32(sum3, _mm512_madd_epi16(b, a3)); - sum4 = _mm512_add_epi32(sum4, _mm512_madd_epi16(b, a4)); - } - put.Write(C + i * num_B_rows + j, Reduce32(sum1, sum2, sum3, sum4)); - } - } - // Handle the non-multiples of 4 rows. - // TODO: efficient version for 3 rows, 2 rows, etc. - for(int i = num_unroll_rows; i < num_A_rows; ++i) { - const __m512i *A1_row = A + i * sse_width; - for(int j = 0; j < num_B_rows; j++) { - __m512i sum1 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - const __m512i *B_row = B + j * sse_width; - __m512i b = *(B_row + k); - __m512i a1 = *(A1_row + k); - sum1 = _mm512_add_epi32(sum1, _mm512_madd_epi16(b, a1)); - } - // TODO is there a more efficient way? - *(C + (i)*num_B_rows + j) - = unquant_mult * static_cast(Reduce32(sum1)); - } - } -} - -namespace { - -/* Three ways considered to apply sign bits: - * 1. Use 256-bit sign instruction: - * __m256i a_first = _mm256_sign_epi8(_mm512_castsi512_si256(a), - * _mm512_castsi512_si256(b)); - * __m256i a_second = _mm256_sign_epi8(_mm512_extracti64x4_epi64(a, 1), - * b_second); a = _mm512_inserti64x4(_mm512_castsi256_si512(a_first), a_second, - * 1); a = Concat(a_first, a_second); - * - * 2. Extract a mask and xor + 1 - * __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - * Use set1 to to build to_xor - * a = _mm512_xor_si512(a, to_xor) - * And add one: - * const __m512i ones8 = _mm512_set1_epi8(1); - * a = _mm512_mask_add_epi8(a, neg_mask, a, ones8); - * - * 3. Extract a mask and subtract from 0 - * In the outer loop on b: - * __mmask64 neg_mask _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)) - * For each a: - * a = _mm512_mask_sub_epi8(a, neg_mask, _mm512_setzero_si512(), a); - * - * Finally, subtraction won the benchmark - */ -inline void Accum(const __m512i zeros, - __m512i a, - const __m512i b, - const __m512i b_positive, - const __mmask64 neg_mask, - __m512i &sum) { - // Apply sign bits. - a = _mm512_mask_sub_epi8(a, neg_mask, zeros, a); - // The magic 8-bit multiply then horizontal sum into 16-bit. - __m512i multiplied = _mm512_maddubs_epi16(b_positive, a); - // Now we have 16-bit results that are the sum of two multiplies. - // Choosing to approximate and do adds. 
- // Perhaps every so often we could accumulate by Convert32Sum - sum = _mm512_adds_epi16(sum, multiplied); - b; // make compiler happy -} - -} // namespace - -void AVX_MatrixMult8(const __m512i *A, - const __m512i *B, - float *C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width) { - assert(width % 32 == 0); - assert(reinterpret_cast(A) % 64 == 0); - assert(reinterpret_cast(B) % 64 == 0); - ScatterPut put(unquant_mult, num_B_rows); - const __m512i zeros = _mm512_setzero_si512(); - - const int sse_width = width / 64; - int i = 0; - int mult8rows = num_A_rows & (~7); - - for(; i < mult8rows; i += 8) { - const __m512i *A1_row = A + (i + 0) * sse_width; - const __m512i *A2_row = A + (i + 1) * sse_width; - const __m512i *A3_row = A + (i + 2) * sse_width; - const __m512i *A4_row = A + (i + 3) * sse_width; - const __m512i *A5_row = A + (i + 4) * sse_width; - const __m512i *A6_row = A + (i + 5) * sse_width; - const __m512i *A7_row = A + (i + 6) * sse_width; - const __m512i *A8_row = A + (i + 7) * sse_width; - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - __m512i sum6 = _mm512_setzero_si512(); - __m512i sum7 = _mm512_setzero_si512(); - __m512i sum8 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - /* Didn't seem to make a difference definining sign bits here vs at top - */ - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6); - Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7); - Accum(zeros, *(A8_row + k), b, b_positive, neg_mask, sum8); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, - Reduce16to32(sum5, sum6, sum7, sum8)); - } - } - - const __m512i *A1_row = A + (i + 0) * sse_width; - const __m512i *A2_row = A + (i + 1) * sse_width; - const __m512i *A3_row = A + (i + 2) * sse_width; - const __m512i *A4_row = A + (i + 3) * sse_width; - const __m512i *A5_row = A + (i + 4) * sse_width; - const __m512i *A6_row = A + (i + 5) * sse_width; - const __m512i *A7_row = A + (i + 6) * sse_width; - switch(num_A_rows & 7) { - case 7: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - __m512i sum6 = _mm512_setzero_si512(); - __m512i sum7 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, 
b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6); - Accum(zeros, *(A7_row + k), b, b_positive, neg_mask, sum7); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6)); - put.Write(C + (i + 6) * num_B_rows + j, Reduce16to32(sum7)); - } - /* fall through */ - case 6: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - __m512i sum6 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - Accum(zeros, *(A6_row + k), b, b_positive, neg_mask, sum6); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5, sum6)); - } - /* fall through */ - case 5: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - __m512i sum5 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - Accum(zeros, *(A5_row + k), b, b_positive, neg_mask, sum5); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - put.Write(C + (i + 4) * num_B_rows + j, Reduce16to32(sum5)); - } - /* fall through */ - case 4: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - __m512i sum4 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - Accum(zeros, *(A4_row + k), b, b_positive, neg_mask, sum4); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2, sum3, sum4)); - } - /* fall through */ - case 3: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - __m512i sum3 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); 
- __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - Accum(zeros, *(A3_row + k), b, b_positive, neg_mask, sum3); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2)); - put.Write(C + (i + 2) * num_B_rows + j, Reduce16to32(sum3)); - } - /* fall through */ - case 2: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - __m512i sum2 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - Accum(zeros, *(A2_row + k), b, b_positive, neg_mask, sum2); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1, sum2)); - } - /* fall through */ - case 1: - for(int j = 0; j < num_B_rows; j++) { - const __m512i *B_row = B + j * sse_width; - __m512i sum1 = _mm512_setzero_si512(); - for(int k = 0; k < sse_width; k++) { - __m512i b = *(B_row + k); - __m512i b_positive = _mm512_abs_epi8(b); - __mmask64 neg_mask = _mm512_test_epi8_mask(b, _mm512_set1_epi8(-128)); - Accum(zeros, *(A1_row + k), b, b_positive, neg_mask, sum1); - } - put.Write(C + i * num_B_rows + j, Reduce16to32(sum1)); - } - } -} - -} // namespace int16 -} // namespace cpu -} // namespace marian -#endif diff --git a/src/tensors/cpu/sharp/int_gemm.cpp b/src/tensors/cpu/sharp/int_gemm.cpp deleted file mode 100644 index cdb7cbf07..000000000 --- a/src/tensors/cpu/sharp/int_gemm.cpp +++ /dev/null @@ -1,187 +0,0 @@ -#include "int_gemm.h" -#include "tensors/tensor_allocator.h" -#include "tensors/tensor_operators.h" - -#include -#include -#include -#include -#include -#include - -namespace marian { -namespace cpu { -namespace int16 { - -#ifdef __AVX512F__ -void AVX_Quantize16(const float* input, - int16_t* output, - float quant_mult, - std::size_t size); - -void AVX_Quantize8(const float* input, - int8_t* output, - float quant_mult, - std::size_t size); - -void AVX_MatrixMult16(const __m512i* A, - const __m512i* B, - float* C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width); - -void AVX_MatrixMult8(const __m512i* A, - const __m512i* B, - float* C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width); -#endif - -void SSE_Quantize16(const float* input, - __m128i* output, - float quant_mult, - int num_rows, - int width); - -void SSE_MatrixMult16(const __m128i* A, - const __m128i* B, - float* C, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width); - -void Quantize16(marian::Tensor out, - const marian::Tensor in, - float /*clipValue*/) { - float quant_mult = (float)pow(2.0, BITS); -#ifdef __AVX512F__ - AVX_Quantize16( - in->data(), out->data(), quant_mult, in->shape().elements()); -#else - int num_rows = in->shape().elements() / in->shape()[-1]; - int width = in->shape()[-1]; - SSE_Quantize16(in->data(), out->data<__m128i>(), quant_mult, num_rows, width); -#endif -} - -void Quantize8(marian::Tensor out, - const marian::Tensor in, - float clipValue) { -#ifdef __AVX512F__ - float quant_mult = 127.0f / clipValue; - AVX_Quantize8( - in->data(), out->data(), quant_mult, in->shape().elements()); -#else - out; in; clipValue; - ABORT("8-bit is currently only AVX512"); -#endif -} - -// This operates on floats after 
processing so doesn't care about int8_t vs -// int16_t. -void AddBias(marian::Tensor C, const marian::Tensor Bias) { - float* y = C->data(); - const float* x = C->data(); - const float* bias = Bias->data(); - - const int m = C->shape().elements() / C->shape()[-1]; - const int n = C->shape()[-1]; - - for(int j = 0; j < m; ++j) { - int i = 0; -#ifdef __AVX512F__ - int n16 = n & ~15; - for(; i < n16; i += 16) { - __m512 ai = _mm512_loadu_ps(x + j * n + i); - __m512 bi = _mm512_loadu_ps(bias + i); - __m512 yi = _mm512_add_ps(ai, bi); - _mm512_storeu_ps(y + j * n + i, yi); - } -#else - int n4 = (n / 4) * 4; - for(; i < n4; i += 4) { - __m128 ai = _mm_loadu_ps(x + j * n + i); - __m128 bi = _mm_loadu_ps(bias + i); - __m128 yi = _mm_add_ps(ai, bi); - _mm_storeu_ps(y + j * n + i, yi); - } -#endif - for(; i < n; i++) { - y[j * n + i] = x[j * n + i] + bias[i]; - } - } -} - -void ProdInt16(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale) { - ABORT_IF(scale != 1, "Scale other than 1 not supported"); - - // @TODO: make this a parameter - float quant_mult = (float)pow(2.0, BITS); - - // If we quantize to n bits and then multiple the values together, the result - // will be quantized to n^2 bits. So we must divide by 1.0/(n^2) to get back - // the original value. - float unquant_mult = 1.0f / (quant_mult * quant_mult); - - float* fC = C->data(); - int num_A_rows = A->shape().elements() / A->shape()[-1]; - int num_B_rows = B->shape().elements() / B->shape()[-1]; - int width = B->shape()[-1]; -#ifdef __AVX512F__ - AVX_MatrixMult16(A->data<__m512i>(), - B->data<__m512i>(), - fC, - unquant_mult, - num_A_rows, - num_B_rows, - width); -#else - SSE_MatrixMult16(A->data<__m128i>(), - B->data<__m128i>(), - fC, - unquant_mult, - num_A_rows, - num_B_rows, - width); -#endif -} - -void ProdInt8(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale, - float clipValue) { -#ifdef __AVX512F__ - // This would be easy... - ABORT_IF(scale != 1, "Scale other than 1 not supported"); - float quant_mult = 127.0f / clipValue; - float unquant_mult = 1.0f / (quant_mult * quant_mult); - - float* fC = C->data(); - int num_A_rows = A->shape().elements() / A->shape()[-1]; - int num_B_rows = B->shape().elements() / B->shape()[-1]; - int width = B->shape()[-1]; - AVX_MatrixMult8(A->data<__m512i>(), - B->data<__m512i>(), - fC, - unquant_mult, - num_A_rows, - num_B_rows, - width); -#else - C; A; B; scale; clipValue; - ABORT("8-bit is currently only AVX512"); -#endif -} - -} // namespace int16 -} // namespace cpu -} // namespace marian diff --git a/src/tensors/cpu/sharp/int_gemm.h b/src/tensors/cpu/sharp/int_gemm.h deleted file mode 100644 index 3ae231562..000000000 --- a/src/tensors/cpu/sharp/int_gemm.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include "tensors/tensor.h" - -namespace marian { -namespace cpu { -namespace int16 { - -const int BITS = 10; - -void Quantize16(marian::Tensor out, - const marian::Tensor in, - float /*clipValue*/); - -void Quantize8(marian::Tensor out, - const marian::Tensor in, - float clipValue); - -// This operates on floats after processing so doesn't care about int8_t vs -// int16_t. 
-void AddBias(marian::Tensor C, const marian::Tensor Bias); - -void ProdInt16(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale); - -void ProdInt8(marian::Tensor C, - const marian::Tensor A, - const marian::Tensor B, - float scale, - float clipValue); - -} // namespace int16 -} // namespace cpu -} // namespace marian diff --git a/src/tensors/cpu/sharp/sse_gemm.cpp b/src/tensors/cpu/sharp/sse_gemm.cpp deleted file mode 100644 index ea6f25cba..000000000 --- a/src/tensors/cpu/sharp/sse_gemm.cpp +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright (c) 2017 Microsoft Corporation - -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace marian { -namespace cpu { -namespace int16 { - -// This is a reference implementation of 16-bit matrix multiplication described -// in "Sharp Models on Dull Hardware: Fast and Accurate Neural Machine -// Translation Decoding on the CPU". This model is not as fast as the one in the -// paper, becuase it uses SSE2 instead of AVX2. AVX2 instructions are only -// available on more modern CPUs (Haswell or later). The only difference between -// SSE2 and AVX2 is that SSE operates on 128-bit vectors and AVX2 operates on -// 256-bit vecetors. So AVX2 can fit 16 16-bit integers intead of 8 8-bit -// integers. The algorithm is the same, you just replace these instructions with -// their 256-bit counterpart, i.e., _mm256_add_epi32, _mm256_madd_epi16, -// _mm256_hadd_epi32, ... Additional improvements can also be made from -// unrolling the for loop over num_B_rows in SSE_MatrixMult, which is not done -// here for clarity. - -// *************************************** -// ************** IMPORTANT ************** -// *************************************** -// The biggest "gotcha" when using this type of multiplication is dealing with -// overflow related to quantization. It is NOT enough to simply ensure that A -// and B fit into 16 bit integers. If A and B are quantized with $n$ bits, the -// result of multiplying them together will be quantized to $n^2$ bits. So if -// they are near the boundary of the 16-bit mark, then the result will be near -// 32-bits and overflow. However, if we use, say, n = 10 bits, then the product -// is 20 bits. This gives us 12 bits left over for the accumulation. So as long -// as the width of the common dimension is less than 2^12 = 4096, it is -// *impossible* to overflow. 
If we used, say, n = 12 bits, then we have -// 32-(12*2) = 8 bits left over. So we *could* overflow if width > 2^8. -// -// So, the tradeoff is between quantization precision and possibility of -// overflow. A good general value is 10 bits, since this gives high precision -// (precision is 1/2^10 ~= 0.001, which is more than what's needed for almost -// all neural nets), and cannot overflow unless the matrix width is > 4096. - -// This quantizes floating point values into fixed-point 16-bit integers. -// Effectively, we are performing an SSE version of float x = ...; int16_t y = -// (int16_t)(quant_mult*x); -// -// Except that the casting is saturated. However, you should always ensure that -// the input fits into a fixed range anyways. I.e., you should ensure that -// quant_mult*x fits into the range [-2^15, 2^15]. This should always be -// possible because the value you're quantizing will either be NN weights or NN -// activations, both of which can be clipped to a fixed range during training. - -void SSE_Quantize16(const float* input, - __m128i* output, - float quant_mult, - int num_rows, - int width) { - assert(width % 8 == 0); - - int num_input_chunks = width / 8; - - // Fill an SSE float with 4 copies of the quant mult - __m128 sse_quant_mult - = _mm_set_ps(quant_mult, quant_mult, quant_mult, quant_mult); - - for(int i = 0; i < num_rows; i++) { - const float* input_row = input + i * width; - __m128i* output_row = output + i * num_input_chunks; - for(int j = 0; j < num_input_chunks; j++) { - const float* x = input_row + j * 8; - // Process 8 floats at once, since each __m128i can contain 8 16-bit - // integers. - - // Load floats floats into SSE registers. - __m128 f_0 = _mm_loadu_ps(x); - __m128 f_1 = _mm_loadu_ps(x + 4); - - // Multiply by quantization factor (e.g., if quant_mult = 1000.0, 0.34291 - // --> 342.21) - __m128 m_0 = _mm_mul_ps(f_0, sse_quant_mult); - __m128 m_1 = _mm_mul_ps(f_1, sse_quant_mult); - - // Cast float to 32-bit int (e.g., 342.21 --> 342) - __m128i i_0 = _mm_cvtps_epi32(m_0); - __m128i i_1 = _mm_cvtps_epi32(m_1); - - // Cast 32-bit int to 16-bit int. You must ensure that these fit into the - // 16-bit range by clipping values during training. - *(output_row + j) = _mm_packs_epi32(i_0, i_1); - } - } -} - -// We are multiplying A * B^T, as opposed to A * B. This is important because it -// means we can do consecutive memory access on A * B^T which allows to to take -// the most advantage of L1 cache. -// -// B is typically a weight matrix, so it can be pre-processed offline, and -// therefore this transpose does not cost anything. A is typically an activation -// minibatch matrix. -void SSE_MatrixMult16(const __m128i* qA, - const __m128i* qB, - float* fC, - float unquant_mult, - int num_A_rows, - int num_B_rows, - int width) { - assert(width % 8 == 0); - - int sse_width = width / 8; - - // We do loop unrolling over A. This is *significantly* faster - // since B can live in the registers. We are assuming that - // A is a multiple of 4, but we can add extra code to handle values of 1, - // 2, 3. - // - // We could also do loop unrolling over B, which adds some additional speedup. - // We don't do that for the sake of clarity. - // - // There are other memory access patterns we could do, e.g., put B on the - // outer loop. The justification is that A is typically small enough that it - // can live in L1 cache. B is usually a larger weight matrix, so it might not - // be able to. 
However, we are using each element of B four times while it's - // still in a register, so caching is not as important. - - int mult4 = (num_A_rows / 4) * 4; - int rest = num_A_rows % 4; - - int i = 0; - for(; i < mult4; i += 4) { - const __m128i* A1_row = qA + (i + 0) * sse_width; - const __m128i* A2_row = qA + (i + 1) * sse_width; - const __m128i* A3_row = qA + (i + 2) * sse_width; - const __m128i* A4_row = qA + (i + 3) * sse_width; - - for(int j = 0; j < num_B_rows; j++) { - const __m128i* B_row = qB + j * sse_width; - - __m128i sum1 = _mm_setzero_si128(); - __m128i sum2 = _mm_setzero_si128(); - __m128i sum3 = _mm_setzero_si128(); - __m128i sum4 = _mm_setzero_si128(); - - // This is just a simple dot product, unrolled four ways. - for(int k = 0; k < sse_width; k++) { - __m128i b = *(B_row + k); - - __m128i a1 = *(A1_row + k); - __m128i a2 = *(A2_row + k); - __m128i a3 = *(A3_row + k); - __m128i a4 = *(A4_row + k); - - // _mm_madd_epi16 does multiply add on 8 16-bit integers and accumulates - // into a four 32-bit register. E.g., a1 = [f1, f2, f3, f4, f5, f6, f7, - // h8] (16-bit ints) b1 = [h1, h2, h3, h4, h5, h6, h7, h8] (16-bit ints) - // result = [f1*h1 + f2*h2, f3*h3 + f4*h4, f5*h5 + f6*h6, f7*h7 + f8*h8] - // (32-bit ints) Then _mm_add_epi32 just effectively does a += on these - // 32-bit integers. - sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1)); - sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2)); - sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3)); - sum4 = _mm_add_epi32(sum4, _mm_madd_epi16(b, a4)); - } - - // We now have each sum spread across 4 32-bit ints in SSE register, e.g., - // sum1 = [r1, r2, r3, r4]. We need to compute r1 + r2 + r3 + r4. - // - // This uses 'horizontal add' to do that efficiently. The first add gets - // us [r1 + r2, r2 + r3, r1 + r2, r2 + r3] Then the second gets us. [r1 + - // r2 + r2 + r3, r2 + r3 + r1 + r2, r1 + r2 + r2 + r3, r2 + r3 + r1 + r2] - // E.g., each 32-bit in contains the full sum. - sum1 = _mm_hadd_epi32(sum1, sum1); - sum1 = _mm_hadd_epi32(sum1, sum1); - sum2 = _mm_hadd_epi32(sum2, sum2); - sum2 = _mm_hadd_epi32(sum2, sum2); - sum3 = _mm_hadd_epi32(sum3, sum3); - sum3 = _mm_hadd_epi32(sum3, sum3); - sum4 = _mm_hadd_epi32(sum4, sum4); - sum4 = _mm_hadd_epi32(sum4, sum4); - - float* C1 = fC + (i + 0) * num_B_rows + j; - float* C2 = fC + (i + 1) * num_B_rows + j; - float* C3 = fC + (i + 2) * num_B_rows + j; - float* C4 = fC + (i + 3) * num_B_rows + j; - - // Now that we have the full sum in each 32-bit register, we convert them - // to an integer with _mm_cvtepi32_ps and take the first one with - // _mm_store_ss. We don't use an SSE instruction to unquantize, although - // we could. It doesn't really matter since most of the computation is in - // the above loop over the width. - // - // Also note that the memory acceses on C are not consecutive, but this is - // a tradeoff that we have to make. We can't have consecutive accesses of - // qA, qB, *and* C. But we access qA and qB a lot more so it makes sense - // to do it this way. 
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-
-      _mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
-      *(C2) *= unquant_mult;
-
-      _mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
-      *(C3) *= unquant_mult;
-
-      _mm_store_ss(C4, _mm_cvtepi32_ps(sum4));
-      *(C4) *= unquant_mult;
-    }
-  }
-  if(rest == 1) {
-    const __m128i* A1_row = qA + (i + 0) * sse_width;
-
-    for(int j = 0; j < num_B_rows; j++) {
-      const __m128i* B_row = qB + j * sse_width;
-
-      __m128i sum1 = _mm_setzero_si128();
-
-      // This is just a simple dot product, unrolled four ways.
-      for(int k = 0; k < sse_width; k++) {
-        __m128i b = *(B_row + k);
-
-        __m128i a1 = *(A1_row + k);
-        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
-      }
-
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-
-      float* C1 = fC + (i + 0) * num_B_rows + j;
-
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-    }
-  } else if(rest == 2) {
-    const __m128i* A1_row = qA + (i + 0) * sse_width;
-    const __m128i* A2_row = qA + (i + 1) * sse_width;
-
-    for(int j = 0; j < num_B_rows; j++) {
-      const __m128i* B_row = qB + j * sse_width;
-
-      __m128i sum1 = _mm_setzero_si128();
-      __m128i sum2 = _mm_setzero_si128();
-
-      for(int k = 0; k < sse_width; k++) {
-        __m128i b = *(B_row + k);
-
-        __m128i a1 = *(A1_row + k);
-        __m128i a2 = *(A2_row + k);
-
-        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
-        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
-      }
-
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-
-      float* C1 = fC + (i + 0) * num_B_rows + j;
-      float* C2 = fC + (i + 1) * num_B_rows + j;
-
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-
-      _mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
-      *(C2) *= unquant_mult;
-    }
-  } else if(rest == 3) {
-    const __m128i* A1_row = qA + (i + 0) * sse_width;
-    const __m128i* A2_row = qA + (i + 1) * sse_width;
-    const __m128i* A3_row = qA + (i + 2) * sse_width;
-
-    for(int j = 0; j < num_B_rows; j++) {
-      const __m128i* B_row = qB + j * sse_width;
-
-      __m128i sum1 = _mm_setzero_si128();
-      __m128i sum2 = _mm_setzero_si128();
-      __m128i sum3 = _mm_setzero_si128();
-
-      for(int k = 0; k < sse_width; k++) {
-        __m128i b = *(B_row + k);
-
-        __m128i a1 = *(A1_row + k);
-        __m128i a2 = *(A2_row + k);
-        __m128i a3 = *(A3_row + k);
-
-        sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(b, a1));
-        sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(b, a2));
-        sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(b, a3));
-      }
-
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum1 = _mm_hadd_epi32(sum1, sum1);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-      sum2 = _mm_hadd_epi32(sum2, sum2);
-      sum3 = _mm_hadd_epi32(sum3, sum3);
-      sum3 = _mm_hadd_epi32(sum3, sum3);
-
-      float* C1 = fC + (i + 0) * num_B_rows + j;
-      float* C2 = fC + (i + 1) * num_B_rows + j;
-      float* C3 = fC + (i + 2) * num_B_rows + j;
-
-      _mm_store_ss(C1, _mm_cvtepi32_ps(sum1));
-      *(C1) *= unquant_mult;
-
-      _mm_store_ss(C2, _mm_cvtepi32_ps(sum2));
-      *(C2) *= unquant_mult;
-
-      _mm_store_ss(C3, _mm_cvtepi32_ps(sum3));
-      *(C3) *= unquant_mult;
-    }
-  }
-}
-
-} // namespace int16
-} // namespace cpu
-} // namespace marian
diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h
index 9bb97174f..75cc604da 100644
--- a/src/tensors/gpu/backend.h
+++ b/src/tensors/gpu/backend.h
@@ -66,16 +66,6 @@ class Backend : public marian::Backend {
 
   CudaCompute getCudaComputeCapability() { return compute_; }
 
-  // for CPU, sets to use optimized code for inference.
-  // for GPU, this is invalid. for gpu, isOptimized() function always returns false.
-  void setOptimized(bool optimize) override {
-    LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize);
-  }
-
-  bool isOptimized() override {
-    return false;
-  }
-
 private:
   cublasHandle_t cublasHandle_{0};     // make sure it's 0, so it can be initalized lazily
   cusparseHandle_t cusparseHandle_{0}; // as above
diff --git a/src/tests/prod.cpp b/src/tests/prod.cpp
index 698712b9c..aae9f5573 100644
--- a/src/tests/prod.cpp
+++ b/src/tests/prod.cpp
@@ -7,7 +7,9 @@ int main(int /*argc*/, char** /*argv*/) {
   {
     auto g = New(true);
     g->setDevice({0, DeviceType::cpu});
-    g->getBackend()->setOptimized(false);
+#if 0 // this file is not a real test, just used for manual stuff. Disable here by hand for now.
+    g->getBackend()->setInt16(false);
+#endif
     g->reserveWorkspaceMB(2512);
 
     timer::AutoTimer timer;
@@ -40,7 +42,44 @@ int main(int /*argc*/, char** /*argv*/) {
   {
     auto g = New(true);
     g->setDevice({0, DeviceType::cpu});
-    g->getBackend()->setOptimized(true);
+#if 0
+    g->getBackend()->setInt16(true);
+#endif
+    g->reserveWorkspaceMB(2512);
+
+    timer::AutoTimer timer;
+    for(int i = 0; i < 100; ++i) {
+      g->clear();
+
+      auto x = g->constant({1, 4, 8, 256}, inits::glorotUniform());
+
+      auto W1 = g->param("W1", {256, 2048}, inits::glorotUniform());
+      auto b1 = g->param("b1", {1, 2048}, inits::glorotUniform());
+
+      auto out = affine(x, W1, b1);
+
+      for(int i = 2; i < 20; ++i) {
+        auto Wi = g->param("W" + std::to_string(i), {2048, 2048}, inits::glorotUniform());
+        auto bi = g->param("b" + std::to_string(i), {1, 2048}, inits::glorotUniform());
+
+        out = relu(affine(out, Wi, bi));
+      }
+
+      auto Wn = g->param("Wn", {2048, 256}, inits::glorotUniform());
+      auto bn = g->param("bn", {1, 256}, inits::glorotUniform());
+
+      auto y = affine(out, Wn, bn);
+
+      g->forward();
+    }
+  }
+
+  {
+    auto g = New(true);
+    g->setDevice({0, DeviceType::cpu});
+#if 0
+    g->getBackend()->setInt8(true);
+#endif
     g->reserveWorkspaceMB(2512);
 
     timer::AutoTimer timer;
diff --git a/src/training/graph_group_async.cpp b/src/training/graph_group_async.cpp
index 4636f4b0c..e47074603 100644
--- a/src/training/graph_group_async.cpp
+++ b/src/training/graph_group_async.cpp
@@ -19,7 +19,6 @@ AsyncGraphGroup::AsyncGraphGroup(Ptr config, Ptr mpi)
     auto graph = New();
     graph->setDevice(device);
     graph->setCheckpointing(options_->get("gradient-checkpointing"));
-    graph->getBackend()->setClip(options_->get("clip-gemm"));
     graph->reserveWorkspaceMB(options_->get("workspace"));
     graphs_.push_back(graph);
     shardOpt_.push_back(Optimizer(options_));
diff --git a/src/training/graph_group_singleton.h b/src/training/graph_group_singleton.h
index 8a93af71a..74d384987 100644
--- a/src/training/graph_group_singleton.h
+++ b/src/training/graph_group_singleton.h
@@ -35,7 +35,6 @@ class SingletonGraph : public GraphGroup, public ExponentialSmoothing {
    graph_ = New();
    graph_->setDevice(deviceId);
    graph_->setCheckpointing(options_->get("gradient-checkpointing"));
-   graph_->getBackend()->setClip(options_->get("clip-gemm"));
    graph_->reserveWorkspaceMB(options_->get("workspace"));
    opt_ = Optimizer(options_);
    builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training);
diff --git a/src/training/graph_group_sync.cpp b/src/training/graph_group_sync.cpp
index eaeefb42b..de6d5e5f0 100755
--- a/src/training/graph_group_sync.cpp
+++ b/src/training/graph_group_sync.cpp
@@ -12,7 +12,6 @@ SyncGraphGroup::SyncGraphGroup(Ptr config, Ptr mpi)
     graph->setDevice(device);
     graph->setCheckpointing(options_->get("gradient-checkpointing"));
     graph->reserveWorkspaceMB(options_->get("workspace"));
-    graph->getBackend()->setClip(options_->get("clip-gemm"));
     graphs_.push_back(graph);
     shardOpt_.push_back(Optimizer(options_));
diff --git a/src/translator/translator.h b/src/translator/translator.h
index 15eb98702..1ff19a4ae 100755
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@@ -87,10 +87,6 @@ class Translate : public ModelTask {
       auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(prec[0]));
       graph->setDevice(device);
-      graph->getBackend()->setClip(options_->get("clip-gemm"));
-      if (device.type == DeviceType::cpu) {
-        graph->getBackend()->setOptimized(options_->get("optimize"));
-      }
       graph->reserveWorkspaceMB(options_->get("workspace"));
       graphs_[id] = graph;
 
@@ -229,10 +225,6 @@ class TranslateService : public ModelServiceTask {
      auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
      graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
      graph->setDevice(device);
-     graph->getBackend()->setClip(options_->get("clip-gemm"));
-     if (device.type == DeviceType::cpu) {
-       graph->getBackend()->setOptimized(options_->get("optimize"));
-     }
      graph->reserveWorkspaceMB(options_->get("workspace"));
      graphs_.push_back(graph);
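
For reference, the arithmetic that the deleted sse_gemm.cpp comments walk through (pairwise multiply-add with _mm_madd_epi16, two _mm_hadd_epi32 horizontal adds, then a convert and scale by unquant_mult) is shown in isolation in the standalone sketch below. It is illustrative only and not part of this patch: the array contents and the unquant_mult value are made up for the example, and it needs SSSE3 to build (e.g. g++ -mssse3).

// Standalone sketch of the 16-bit dot-product reduction documented above;
// illustrative only, not part of the patch.
#include <tmmintrin.h>  // SSSE3 for _mm_hadd_epi32; pulls in SSE2 for the rest
#include <cstdint>
#include <cstdio>

int main() {
  // Eight quantized 16-bit values each, standing in for one SSE-width chunk
  // of a row of A and a row of B.
  alignas(16) int16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  alignas(16) int16_t b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
  const float unquant_mult = 0.5f;  // example scale; the real code derives it from quantization

  __m128i va = _mm_load_si128(reinterpret_cast<const __m128i*>(a));
  __m128i vb = _mm_load_si128(reinterpret_cast<const __m128i*>(b));

  // [a0*b0+a1*b1, a2*b2+a3*b3, a4*b4+a5*b5, a6*b6+a7*b7] as four 32-bit ints.
  __m128i sum = _mm_madd_epi16(va, vb);

  // Two horizontal adds leave the complete dot product in every 32-bit lane.
  sum = _mm_hadd_epi32(sum, sum);
  sum = _mm_hadd_epi32(sum, sum);

  // Convert lane 0 to float and unquantize, as the removed kernel did per element of C.
  float c;
  _mm_store_ss(&c, _mm_cvtepi32_ps(sum));
  c *= unquant_mult;

  printf("dot = %.1f\n", c);  // (1*8 + 2*7 + ... + 8*1) * 0.5 = 120 * 0.5 = 60.0
  return 0;
}

The removed kernel unrolls this over four rows of A at a time so that each loaded register of B is reused four times before it leaves the registers, which is the tradeoff its comments describe.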